//===- NVGPUDialect.cpp - MLIR NVGPU ops implementation -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the NVGPU dialect and its operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/TypeUtilities.h"
#include "llvm/ADT/TypeSwitch.h"

using namespace mlir;
using namespace mlir::nvgpu;

#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.cpp.inc"

void nvgpu::NVGPUDialect::initialize() {
  addTypes<DeviceAsyncTokenType>();
  addOperations<
#define GET_OP_LIST
#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
      >();
}

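// The only type currently registered by the dialect is the device-side async
// token; as an illustrative example, its expected textual form in IR is
// `!nvgpu.device.async.token`.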
Type NVGPUDialect::parseType(DialectAsmParser &parser) const {
  // Parse the main keyword for the type.
  StringRef keyword;
  if (parser.parseKeyword(&keyword))
    return Type();
  MLIRContext *context = getContext();
  // Handle 'device async token' types.
  if (keyword == "device.async.token")
    return DeviceAsyncTokenType::get(context);

  parser.emitError(parser.getNameLoc(), "unknown nvgpu type: " + keyword);
  return Type();
}

void NVGPUDialect::printType(Type type, DialectAsmPrinter &os) const {
  TypeSwitch<Type>(type)
      .Case<DeviceAsyncTokenType>([&](Type) { os << "device.async.token"; })
      .Default([](Type) { llvm_unreachable("unexpected 'nvgpu' type kind"); });
}

//===----------------------------------------------------------------------===//
// NVGPU_DeviceAsyncCopyOp
//===----------------------------------------------------------------------===//

/// Return true if the last dimension of the MemRefType has unit stride. Also
/// return true for memrefs with no strides.
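/// For example, a memref with strides [8, 1] (or with the default identity
/// layout) satisfies this, while one with strides [1, 4] does not.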
static bool isLastMemrefDimUnitStride(MemRefType type) {
  int64_t offset;
  SmallVector<int64_t> strides;
  if (failed(getStridesAndOffset(type, strides, offset))) {
    return false;
  }
  // Rank-0 memrefs have no strides; treat them as unit stride.
  return strides.empty() || strides.back() == 1;
}

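// As an illustrative example (operand names and types below are placeholders),
// a well-formed copy from global to workgroup (shared) memory looks like:
//
//   %token = nvgpu.device_async_copy %src[%i, %j], %dst[%k, %l, %m], 4
//       : memref<4x5xf32> to memref<2x7x5xf32, 3>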
LogicalResult DeviceAsyncCopyOp::verify() {
  auto srcMemref = getSrc().getType().cast<MemRefType>();
  auto dstMemref = getDst().getType().cast<MemRefType>();
  unsigned workgroupAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace();
  if (!isLastMemrefDimUnitStride(srcMemref))
    return emitError("source memref most minor dim must have unit stride");
  if (!isLastMemrefDimUnitStride(dstMemref))
    return emitError("destination memref most minor dim must have unit stride");
  if (dstMemref.getMemorySpaceAsInt() != workgroupAddressSpace)
    return emitError("destination memref must have memory space ")
           << workgroupAddressSpace;
  if (dstMemref.getElementType() != srcMemref.getElementType())
    return emitError("source and destination must have the same element type");
  if (size_t(srcMemref.getRank()) != getSrcIndices().size())
    return emitOpError() << "expected " << srcMemref.getRank()
                         << " source indices, got " << getSrcIndices().size();
  if (size_t(dstMemref.getRank()) != getDstIndices().size())
    return emitOpError() << "expected " << dstMemref.getRank()
                         << " destination indices, got "
                         << getDstIndices().size();
  return success();
}

//===----------------------------------------------------------------------===//
// NVGPU_MmaSyncOp
//===----------------------------------------------------------------------===//

LogicalResult MmaSyncOp::verify() {

  // Fundamental tensor core mma.sync op
  // For the F32 (TF32), F16, S8, and S4 data types, the fundamental tensor
  // core operation has the shape 8-by-8-by-128b; F64 is the exception. The
  // verification of mma.sync across the various shapes and data types is
  // based on this fundamental tensor core operation.
  constexpr int kThreads = 32; // 32 threads per warp
  int64_t shapeM = 8;
  int64_t shapeN = 8;
  int64_t shapeK; // set based on data type (128b for all data types except F64)

  // Number of elements A, B, and C per thread per fundamental tensor core tile
  int64_t numElementA;    // set based on data type (32b except F64)
  int64_t numElementB;    // set based on data type (32b except F64)
  int64_t numElementC{2}; // two accumulator elements per fundamental tile

  // nvgpu.mma.sync vector operands (per thread)
  auto aVector = getMatrixA().getType().cast<VectorType>();
  auto bVector = getMatrixB().getType().cast<VectorType>();
  auto cVector = getMatrixC().getType().cast<VectorType>();

  // vector shapes
  ArrayRef<int64_t> aShape = aVector.getShape();
  ArrayRef<int64_t> bShape = bVector.getShape();
  ArrayRef<int64_t> cShape = cVector.getShape();

  // vector element type
  Type aType = aVector.getElementType();

  // nvgpu.mma.sync shape (per 32 threads or per warp)
  int64_t m = getMmaShape()[0].cast<IntegerAttr>().getInt();
  int64_t n = getMmaShape()[1].cast<IntegerAttr>().getInt();
  int64_t k = getMmaShape()[2].cast<IntegerAttr>().getInt();

  if (aType.isF64()) {
    // exception to 8-by-8-128b fundamental tensor core tile size
    shapeK = 4;
    numElementA = 1;
    numElementB = 1;
  } else if (aType.isF32() || aType.isBF16() || aType.isF16() ||
             aType.isInteger(8) || aType.isInteger(4)) {
    // 8-by-8-128b fundamental tensor core tile size
    int operandBitwidth = aType.getIntOrFloatBitWidth();
    shapeK = 128 / operandBitwidth;     // 128b wide shapeK
    numElementA = 32 / operandBitwidth; // 32b wide operand A
    numElementB = 32 / operandBitwidth; // 32b wide operand B
  } else {
    return emitError() << "expected input data type (i4,i8,f16,bf16,tf32,f64) "
                          "supported by nvgpu.mma.sync";
  }
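
  // As an illustrative example (the numbers follow from the rules above): for
  // an f16 operation with mmaShape = [16, 8, 16],
  //   shapeK      = 128 / 16 = 8  (the fundamental tile is 8-by-8-by-8)
  //   numElementA = 32 / 16  = 2
  //   numElementB = 32 / 16  = 2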

  //
  // Basic verification
  //

  // verify warp-wide size for vector a
  if (aShape[0] * aShape[1] * kThreads != m * k)
    return emitOpError() << "expected " << m * k
                         << " warp-wide matrix A elements";

  // verify warp-wide size for vector b
  if (bShape[0] * bShape[1] * kThreads != k * n)
    return emitOpError() << "expected " << k * n
                         << " warp-wide matrix B elements";

  // verify warp-wide size for vector c
  if (cShape[0] * cShape[1] * kThreads != m * n)
    return emitOpError() << "expected " << m * n
                         << " warp-wide matrix C elements";

  //
  // Extended verification
  //

  // tiles of fundamental tensor core operations
  int64_t mTile = m / shapeM;
  int64_t nTile = n / shapeN;
  int64_t kTile = k / shapeK;
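  // Continuing the illustrative f16 mmaShape = [16, 8, 16] example:
  // mTile = 2, nTile = 1, kTile = 2, so the expected per-thread operand
  // shapes are 4x2 for A, 2x2 for B, and 2x2 for the accumulator C
  // (e.g. vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>).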

  // verify shape of aVector
  if (!((aShape[0] == mTile * kTile) && (aShape[1] == numElementA)))
    return emitOpError() << "expected matrix A to be shaped (" << mTile * kTile
                         << " x " << numElementA << ")";

  // verify shape of bVector
  if (!((bShape[0] == kTile * nTile) && (bShape[1] == numElementB)))
    return emitOpError() << "expected matrix B to be shaped (" << kTile * nTile
                         << " x " << numElementB << ")";

  // verify shape of cVector
  if (!((cShape[0] == mTile * nTile) && (cShape[1] == numElementC)))
    return emitOpError() << "expected matrix C to be shaped (" << mTile * nTile
                         << " x " << numElementC << ")";

  return success();
}

//===----------------------------------------------------------------------===//
// NVGPU_LdMatrixOp
//===----------------------------------------------------------------------===//
LogicalResult LdMatrixOp::verify() {

  // ldmatrix reads data from source in shared memory
  auto srcMemref = getSrcMemref().getType().cast<MemRefType>();

  // ldmatrix writes data to result/destination in vector registers
  auto resVector = getRes().getType().cast<VectorType>();

  // vector register shape, element type, and bitwidth
  ArrayRef<int64_t> resShape = resVector.getShape();
  Type resType = resVector.getElementType();
  int64_t elementBitWidth = resType.getIntOrFloatBitWidth();

  // ldmatrix loads 32 bits into vector registers per 8-by-8 tile per thread
  int64_t numElementsPer32b = 32 / elementBitWidth;

  // number of 8-by-8 tiles
  int64_t numTiles = getNumTiles();

  // transpose elements in vector registers at 16b granularity when true
  bool isTranspose = getTranspose();

  // address space id for shared memory
  unsigned smemAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace();

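  // As an illustrative example (syntax shown for clarity only): loading four
  // 8-by-8 tiles of f16 gives elementBitWidth = 16 and numElementsPer32b = 2,
  // so the expected result type is vector<4x2xf16>:
  //
  //   %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false}
  //       : memref<?x?xf16, 3> -> vector<4x2xf16>
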
  //
  // verification
  //

  if (srcMemref.getMemorySpaceAsInt() != smemAddressSpace)
    return emitError()
           << "nvgpu.ldmatrix srcMemref must have memory space "
           << smemAddressSpace;
  if (elementBitWidth > 32)
    return emitError() << "nvgpu.ldmatrix works for 32b or lower";
  if (isTranspose && elementBitWidth != 16)
    return emitError()
           << "nvgpu.ldmatrix transpose works only at 16b granularity";
  if (resShape[1] != numElementsPer32b)
    return emitError() << "expected vector register shape[1] = "
                       << numElementsPer32b;
  if (resShape[0] != numTiles)
    return emitError()
           << "expected vector register shape[0] and numTiles to match";

  return success();
}

#define GET_OP_CLASSES
#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"