NVGPU/Transforms/OptimizeSharedMemory.cpp

//===- OptimizeSharedMemory.cpp - MLIR NVGPU pass implementation ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements transforms to optimize accesses to shared memory.
//
//===----------------------------------------------------------------------===//
51b925dfSChristopher Bate#include "PassDetail.h"
51b925dfSChristopher Bate#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
51b925dfSChristopher Bate#include "mlir/Dialect/GPU/IR/GPUDialect.h"
51b925dfSChristopher Bate#include "mlir/Dialect/MemRef/IR/MemRef.h"
51b925dfSChristopher Bate#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
51b925dfSChristopher Bate#include "mlir/Dialect/NVGPU/Passes.h"
51b925dfSChristopher Bate#include "mlir/Dialect/NVGPU/Transforms/Transforms.h"
51b925dfSChristopher Bate#include "mlir/Dialect/Vector/IR/VectorOps.h"
51b925dfSChristopher Bate#include "mlir/Interfaces/SideEffectInterfaces.h"
51b925dfSChristopher Bate#include "mlir/Support/LogicalResult.h"
51b925dfSChristopher Bate#include "llvm/ADT/STLExtras.h"
51b925dfSChristopher Bate#include "llvm/Support/MathExtras.h"
51b925dfSChristopher Bate
51b925dfSChristopher Bateusing namespace mlir;
51b925dfSChristopher Bateusing namespace mlir::nvgpu;
51b925dfSChristopher Bate
/// The size, in bytes, of a shared memory line according to NV documentation.
constexpr int64_t kSharedMemoryLineSizeBytes = 128;
/// The vector access width, in bits, that the index permutation is tuned for.
/// We optimize for 128bit accesses, but this can be made an argument in the
/// future.
constexpr int64_t kDefaultVectorSizeBits = 128;

// The transforms in this file take integer log2 of values derived from these
// constants, which silently rounds down for anything that is not a power of
// two. Enforce the invariants at compile time so a future edit cannot break
// the bit-level index arithmetic unnoticed.
static_assert(kSharedMemoryLineSizeBytes > 0 &&
                  (kSharedMemoryLineSizeBytes &
                   (kSharedMemoryLineSizeBytes - 1)) == 0,
              "shared memory line size must be a positive power of two");
static_assert(kDefaultVectorSizeBits >= 8 &&
                  (kDefaultVectorSizeBits & (kDefaultVectorSizeBits - 1)) == 0,
              "vector size must be a power of two of at least one byte");
static_assert(kDefaultVectorSizeBits / 8 <= kSharedMemoryLineSizeBytes,
              "a vector access must fit within one shared memory line");
51b925dfSChristopher Bate
51b925dfSChristopher Bate/// Uses `srcIndexValue` to permute `tgtIndexValue` via
51b925dfSChristopher Bate/// `result = xor(floordiv(srcIdxVal,permuteEveryN),
51b925dfSChristopher Bate///               floordiv(tgtIdxVal,vectorSize)))
51b925dfSChristopher Bate///            + tgtIdxVal % vectorSize`
51b925dfSChristopher Bate/// This is done using an optimized sequence of `arith` operations.
51b925dfSChristopher Batestatic Value permuteVectorOffset(OpBuilder &b, Location loc,
51b925dfSChristopher Bate                                 ArrayRef<Value> indices, MemRefType memrefTy,
51b925dfSChristopher Bate                                 int64_t srcDim, int64_t tgtDim) {
51b925dfSChristopher Bate  // Adjust the src index to change how often the permutation changes
51b925dfSChristopher Bate  // if necessary.
51b925dfSChristopher Bate  Value src = indices[srcDim];
51b925dfSChristopher Bate
51b925dfSChristopher Bate  // We only want to permute every N iterations of the target dim where N is
51b925dfSChristopher Bate  // ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)).
51b925dfSChristopher Bate  const int64_t permuteEveryN = std::max<int64_t>(
51b925dfSChristopher Bate      1, kSharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
51b925dfSChristopher Bate                                        memrefTy.getElementTypeBitWidth()) /
51b925dfSChristopher Bate                                       8));
51b925dfSChristopher Bate
51b925dfSChristopher Bate  // clang-format off
51b925dfSChristopher Bate  // Index bit representation (b0 = least significant bit) for dim(1)
51b925dfSChristopher Bate  // of a `memref<?x?xDT>` is as follows:
51b925dfSChristopher Bate  // N := log2(128/elementSizeBits)
51b925dfSChristopher Bate  // M := log2(dimSize(1))
51b925dfSChristopher Bate  // then
51b925dfSChristopher Bate  // bits[0:N] = sub-vector element offset
51b925dfSChristopher Bate  // bits[N:M] = vector index
51b925dfSChristopher Bate  // clang-format on
51b925dfSChristopher Bate  int64_t N =
51b925dfSChristopher Bate      llvm::Log2_64(kDefaultVectorSizeBits / memrefTy.getElementTypeBitWidth());
51b925dfSChristopher Bate  int64_t M = llvm::Log2_64(memrefTy.getDimSize(tgtDim));
51b925dfSChristopher Bate
51b925dfSChristopher Bate  // Capture bits[0:(M-N)] of src by first creating a (M-N) mask.
829c84ecSChristopher Bate  int64_t mask = (1LL << (M - N)) - 1;
51b925dfSChristopher Bate  if (permuteEveryN > 1)
51b925dfSChristopher Bate    mask = mask << llvm::Log2_64(permuteEveryN);
51b925dfSChristopher Bate  Value srcBits = b.create<arith::ConstantIndexOp>(loc, mask);
51b925dfSChristopher Bate  srcBits = b.create<arith::AndIOp>(loc, src, srcBits);
51b925dfSChristopher Bate
51b925dfSChristopher Bate  // Use the src bits to permute the target bits b[N:M] containing the
51b925dfSChristopher Bate  // vector offset.
51b925dfSChristopher Bate  if (permuteEveryN > 1) {
51b925dfSChristopher Bate    int64_t shlBits = N - llvm::Log2_64(permuteEveryN);
51b925dfSChristopher Bate    if (shlBits > 0) {
51b925dfSChristopher Bate      Value finalShiftVal = b.create<arith::ConstantIndexOp>(loc, shlBits);
51b925dfSChristopher Bate      srcBits = b.createOrFold<arith::ShLIOp>(loc, srcBits, finalShiftVal);
51b925dfSChristopher Bate    } else if (shlBits < 0) {
51b925dfSChristopher Bate      Value finalShiftVal = b.create<arith::ConstantIndexOp>(loc, -1 * shlBits);
51b925dfSChristopher Bate      srcBits = b.createOrFold<arith::ShRUIOp>(loc, srcBits, finalShiftVal);
51b925dfSChristopher Bate    }
51b925dfSChristopher Bate  } else {
51b925dfSChristopher Bate    Value finalShiftVal = b.create<arith::ConstantIndexOp>(loc, N);
51b925dfSChristopher Bate    srcBits = b.createOrFold<arith::ShLIOp>(loc, srcBits, finalShiftVal);
51b925dfSChristopher Bate  }
51b925dfSChristopher Bate
51b925dfSChristopher Bate  Value permutedVectorIdx =
51b925dfSChristopher Bate      b.create<arith::XOrIOp>(loc, indices[tgtDim], srcBits);
51b925dfSChristopher Bate  return permutedVectorIdx;
51b925dfSChristopher Bate}
51b925dfSChristopher Bate
51b925dfSChristopher Batestatic void transformIndices(OpBuilder &builder, Location loc,
51b925dfSChristopher Bate                             SmallVector<Value, 4> &indices,
51b925dfSChristopher Bate                             MemRefType memrefTy, int64_t srcDim,
51b925dfSChristopher Bate                             int64_t tgtDim) {
51b925dfSChristopher Bate  indices[tgtDim] =
51b925dfSChristopher Bate      permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim);
51b925dfSChristopher Bate}
51b925dfSChristopher Bate
51b925dfSChristopher BateOperation::operand_range getIndices(Operation *op) {
51b925dfSChristopher Bate  if (auto ldmatrixOp = dyn_cast<LdMatrixOp>(op))
8df54a6aSJacques Pienaar    return ldmatrixOp.getIndices();
51b925dfSChristopher Bate  if (auto copyOp = dyn_cast<DeviceAsyncCopyOp>(op))
8df54a6aSJacques Pienaar    return copyOp.getDstIndices();
51b925dfSChristopher Bate  if (auto loadOp = dyn_cast<memref::LoadOp>(op))
*136d746eSJacques Pienaar    return loadOp.getIndices();
51b925dfSChristopher Bate  if (auto storeOp = dyn_cast<memref::StoreOp>(op))
*136d746eSJacques Pienaar    return storeOp.getIndices();
51b925dfSChristopher Bate  if (auto vectorReadOp = dyn_cast<vector::LoadOp>(op))
51b925dfSChristopher Bate    return vectorReadOp.getIndices();
51b925dfSChristopher Bate  if (auto vectorStoreOp = dyn_cast<vector::StoreOp>(op))
51b925dfSChristopher Bate    return vectorStoreOp.getIndices();
51b925dfSChristopher Bate  llvm_unreachable("unsupported op type");
51b925dfSChristopher Bate}
51b925dfSChristopher Bate
51b925dfSChristopher Batevoid setIndices(Operation *op, ArrayRef<Value> indices) {
51b925dfSChristopher Bate  if (auto ldmatrixOp = dyn_cast<LdMatrixOp>(op))
8df54a6aSJacques Pienaar    return ldmatrixOp.getIndicesMutable().assign(indices);
51b925dfSChristopher Bate  if (auto copyOp = dyn_cast<DeviceAsyncCopyOp>(op))
8df54a6aSJacques Pienaar    return copyOp.getDstIndicesMutable().assign(indices);
51b925dfSChristopher Bate  if (auto loadOp = dyn_cast<memref::LoadOp>(op))
*136d746eSJacques Pienaar    return loadOp.getIndicesMutable().assign(indices);
51b925dfSChristopher Bate  if (auto storeOp = dyn_cast<memref::StoreOp>(op))
*136d746eSJacques Pienaar    return storeOp.getIndicesMutable().assign(indices);
51b925dfSChristopher Bate  if (auto vectorReadOp = dyn_cast<vector::LoadOp>(op))
51b925dfSChristopher Bate    return vectorReadOp.getIndicesMutable().assign(indices);
51b925dfSChristopher Bate  if (auto vectorStoreOp = dyn_cast<vector::StoreOp>(op))
51b925dfSChristopher Bate    return vectorStoreOp.getIndicesMutable().assign(indices);
51b925dfSChristopher Bate  llvm_unreachable("unsupported op type");
51b925dfSChristopher Bate}
51b925dfSChristopher Bate
51b925dfSChristopher Bate/// Return all operations within `parentOp` that read from or write to
51b925dfSChristopher Bate/// `shmMemRef`.
51b925dfSChristopher Batestatic LogicalResult
51b925dfSChristopher BategetShmReadAndWriteOps(Operation *parentOp, Value shmMemRef,
51b925dfSChristopher Bate                      SmallVector<Operation *, 16> &readOps,
51b925dfSChristopher Bate                      SmallVector<Operation *, 16> &writeOps) {
51b925dfSChristopher Bate  parentOp->walk([&](Operation *op) {
51b925dfSChristopher Bate    MemoryEffectOpInterface iface = dyn_cast<MemoryEffectOpInterface>(op);
51b925dfSChristopher Bate    if (!iface)
51b925dfSChristopher Bate      return;
51b925dfSChristopher Bate    Optional<MemoryEffects::EffectInstance> effect =
51b925dfSChristopher Bate        iface.getEffectOnValue<MemoryEffects::Read>(shmMemRef);
51b925dfSChristopher Bate    if (effect) {
51b925dfSChristopher Bate      readOps.push_back(op);
51b925dfSChristopher Bate      return;
51b925dfSChristopher Bate    }
51b925dfSChristopher Bate    effect = iface.getEffectOnValue<MemoryEffects::Write>(shmMemRef);
51b925dfSChristopher Bate    if (effect)
51b925dfSChristopher Bate      writeOps.push_back(op);
51b925dfSChristopher Bate  });
51b925dfSChristopher Bate
51b925dfSChristopher Bate  // Restrict to a supported set of ops. We also require at least 2D access,
51b925dfSChristopher Bate  // although this could be relaxed.
51b925dfSChristopher Bate  if (llvm::any_of(readOps, [](Operation *op) {
51b925dfSChristopher Bate        return !isa<memref::LoadOp, vector::LoadOp, nvgpu::LdMatrixOp>(op) ||
51b925dfSChristopher Bate               getIndices(op).size() < 2;
51b925dfSChristopher Bate      }))
51b925dfSChristopher Bate    return failure();
51b925dfSChristopher Bate  if (llvm::any_of(writeOps, [](Operation *op) {
51b925dfSChristopher Bate        return !isa<memref::StoreOp, vector::StoreOp, nvgpu::DeviceAsyncCopyOp>(
51b925dfSChristopher Bate                   op) ||
51b925dfSChristopher Bate               getIndices(op).size() < 2;
51b925dfSChristopher Bate      }))
51b925dfSChristopher Bate    return failure();
51b925dfSChristopher Bate
51b925dfSChristopher Bate  return success();
51b925dfSChristopher Bate}
51b925dfSChristopher Bate
51b925dfSChristopher Batemlir::LogicalResult
51b925dfSChristopher Batemlir::nvgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
51b925dfSChristopher Bate                                                Value memrefValue) {
51b925dfSChristopher Bate  auto memRefType = memrefValue.getType().dyn_cast<MemRefType>();
51b925dfSChristopher Bate  if (!memRefType || memRefType.getMemorySpaceAsInt() !=
51b925dfSChristopher Bate                         gpu::GPUDialect::getWorkgroupAddressSpace())
51b925dfSChristopher Bate    return failure();
51b925dfSChristopher Bate
51b925dfSChristopher Bate  // Abort if the given value has any sub-views; we do not do any alias
51b925dfSChristopher Bate  // analysis.
51b925dfSChristopher Bate  bool hasSubView = false;
51b925dfSChristopher Bate  parentOp->walk([&](memref::SubViewOp subView) { hasSubView = true; });
51b925dfSChristopher Bate  if (hasSubView)
51b925dfSChristopher Bate    return failure();
51b925dfSChristopher Bate
51b925dfSChristopher Bate  // Check if this is necessary given the assumption of 128b accesses:
51b925dfSChristopher Bate  // If dim[rank-1] is small enough to fit 8 rows in a 128B line.
51b925dfSChristopher Bate  const int64_t rowSize = memRefType.getDimSize(memRefType.getRank() - 1);
51b925dfSChristopher Bate  const int64_t rowsPerLine =
51b925dfSChristopher Bate      (8 * kSharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
51b925dfSChristopher Bate      rowSize;
51b925dfSChristopher Bate  const int64_t threadGroupSize =
829c84ecSChristopher Bate      1LL << (7 - llvm::Log2_64(kDefaultVectorSizeBits / 8));
51b925dfSChristopher Bate  if (rowsPerLine >= threadGroupSize)
51b925dfSChristopher Bate    return failure();
51b925dfSChristopher Bate
51b925dfSChristopher Bate  // Get sets of operations within the function that read/write to shared
51b925dfSChristopher Bate  // memory.
51b925dfSChristopher Bate  SmallVector<Operation *, 16> shmReadOps;
51b925dfSChristopher Bate  SmallVector<Operation *, 16> shmWriteOps;
51b925dfSChristopher Bate  if (failed(getShmReadAndWriteOps(parentOp, memrefValue, shmReadOps,
51b925dfSChristopher Bate                                   shmWriteOps)))
51b925dfSChristopher Bate    return failure();
51b925dfSChristopher Bate
51b925dfSChristopher Bate  if (shmReadOps.empty() || shmWriteOps.empty())
51b925dfSChristopher Bate    return failure();
51b925dfSChristopher Bate
51b925dfSChristopher Bate  OpBuilder builder(parentOp->getContext());
51b925dfSChristopher Bate
51b925dfSChristopher Bate  int64_t tgtDim = memRefType.getRank() - 1;
51b925dfSChristopher Bate  int64_t srcDim = memRefType.getRank() - 2;
51b925dfSChristopher Bate
51b925dfSChristopher Bate  // Transform indices for the ops writing to shared memory.
51b925dfSChristopher Bate  while (!shmWriteOps.empty()) {
51b925dfSChristopher Bate    Operation *shmWriteOp = shmWriteOps.back();
51b925dfSChristopher Bate    shmWriteOps.pop_back();
51b925dfSChristopher Bate    builder.setInsertionPoint(shmWriteOp);
51b925dfSChristopher Bate
51b925dfSChristopher Bate    auto indices = getIndices(shmWriteOp);
51b925dfSChristopher Bate    SmallVector<Value, 4> transformedIndices(indices.begin(), indices.end());
51b925dfSChristopher Bate    transformIndices(builder, shmWriteOp->getLoc(), transformedIndices,
51b925dfSChristopher Bate                     memRefType, srcDim, tgtDim);
51b925dfSChristopher Bate    setIndices(shmWriteOp, transformedIndices);
51b925dfSChristopher Bate  }
51b925dfSChristopher Bate
51b925dfSChristopher Bate  // Transform indices for the ops reading from shared memory.
51b925dfSChristopher Bate  while (!shmReadOps.empty()) {
51b925dfSChristopher Bate    Operation *shmReadOp = shmReadOps.back();
51b925dfSChristopher Bate    shmReadOps.pop_back();
51b925dfSChristopher Bate    builder.setInsertionPoint(shmReadOp);
51b925dfSChristopher Bate
51b925dfSChristopher Bate    auto indices = getIndices(shmReadOp);
51b925dfSChristopher Bate    SmallVector<Value, 4> transformedIndices(indices.begin(), indices.end());
51b925dfSChristopher Bate    transformIndices(builder, shmReadOp->getLoc(), transformedIndices,
51b925dfSChristopher Bate                     memRefType, srcDim, tgtDim);
51b925dfSChristopher Bate    setIndices(shmReadOp, transformedIndices);
51b925dfSChristopher Bate  }
51b925dfSChristopher Bate
51b925dfSChristopher Bate  return success();
51b925dfSChristopher Bate}
51b925dfSChristopher Bate
51b925dfSChristopher Batenamespace {
51b925dfSChristopher Bateclass OptimizeSharedMemoryPass
51b925dfSChristopher Bate    : public OptimizeSharedMemoryBase<OptimizeSharedMemoryPass> {
51b925dfSChristopher Batepublic:
51b925dfSChristopher Bate  OptimizeSharedMemoryPass() = default;
51b925dfSChristopher Bate
51b925dfSChristopher Bate  void runOnOperation() override {
51b925dfSChristopher Bate    Operation *op = getOperation();
51b925dfSChristopher Bate    SmallVector<memref::AllocOp> shmAllocOps;
51b925dfSChristopher Bate    op->walk([&](memref::AllocOp allocOp) {
*136d746eSJacques Pienaar      if (allocOp.getMemref()
*136d746eSJacques Pienaar              .getType()
*136d746eSJacques Pienaar              .cast<MemRefType>()
*136d746eSJacques Pienaar              .getMemorySpaceAsInt() !=
51b925dfSChristopher Bate          gpu::GPUDialect::getWorkgroupAddressSpace())
51b925dfSChristopher Bate        return;
51b925dfSChristopher Bate      shmAllocOps.push_back(allocOp);
51b925dfSChristopher Bate    });
51b925dfSChristopher Bate    for (auto allocOp : shmAllocOps) {
51b925dfSChristopher Bate      if (failed(optimizeSharedMemoryReadsAndWrites(getOperation(),
*136d746eSJacques Pienaar                                                    allocOp.getMemref())))
51b925dfSChristopher Bate        return;
51b925dfSChristopher Bate    }
51b925dfSChristopher Bate  }
51b925dfSChristopher Bate};
51b925dfSChristopher Bate} // namespace
51b925dfSChristopher Bate
51b925dfSChristopher Batestd::unique_ptr<Pass> mlir::nvgpu::createOptimizeSharedMemoryPass() {
51b925dfSChristopher Bate  return std::make_unique<OptimizeSharedMemoryPass>();
51b925dfSChristopher Bate}