Conversion/VectorToGPU/NvGpuSupport.cpp

//===- NvGpuSupport.cpp - MLIR Vector to GPU lowering support --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides utilities to assist in the lowering of Vector operations
// to NvGPU dialect MMA operations.
//
//===----------------------------------------------------------------------===//
1ca772edSChristopher Bate
1ca772edSChristopher Bate#include "NvGpuSupport.h"
1ca772edSChristopher Bate#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
51b925dfSChristopher Bate#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
1ca772edSChristopher Bate#include "mlir/Dialect/Vector/IR/VectorOps.h"
1ca772edSChristopher Bate
1ca772edSChristopher Batenamespace mlir {
1ca772edSChristopher Batenamespace nvgpu {
1ca772edSChristopher Batenamespace {
1ca772edSChristopher Bate
/// Every [128|256|512] bit row of a tile is always covered by 4 threads.
constexpr int64_t kThreadsPerRow{4};

/// Number of rows that make up a single tile.
constexpr int64_t kNumRowsPerTile{8};
1ca772edSChristopher Bate
1ca772edSChristopher Batebool isAccumulatorOrResult(MatMulOperandRole operandType) {
1ca772edSChristopher Bate  return operandType == MatMulOperandRole::C;
1ca772edSChristopher Bate}
1ca772edSChristopher Bate
1ca772edSChristopher Bate/// Returns the number of registers which compose a matrix fragment held by a
1ca772edSChristopher Bate/// single thread.
1ca772edSChristopher Bateint64_t inferNumRegistersPerMatrixFragment(const WarpMatrixInfo &type) {
1ca772edSChristopher Bate  int64_t lineSize = inferTileWidthInBits(type);
1ca772edSChristopher Bate  auto shape = type.vectorType.getShape();
1ca772edSChristopher Bate  return (shape[0] / kNumRowsPerTile) *
1ca772edSChristopher Bate         (shape[1] * type.vectorType.getElementType().getIntOrFloatBitWidth()) /
1ca772edSChristopher Bate         lineSize;
1ca772edSChristopher Bate}
1ca772edSChristopher Bate
1ca772edSChristopher Bate/// Returns the number of 8 x [128|256|512] bit tiles that compose the given
1ca772edSChristopher Bate/// operand shape.
1ca772edSChristopher Batestd::array<int64_t, 2> getTileShape(ArrayRef<int64_t> operandShape,
1ca772edSChristopher Bate                                    Type elementType, int64_t lineSizeBits) {
1ca772edSChristopher Bate  // For each 8x128bit square, a thread is responsible for one 32bit register.
1ca772edSChristopher Bate  return {operandShape[0] / kNumRowsPerTile,
1ca772edSChristopher Bate          (operandShape[1] * elementType.getIntOrFloatBitWidth()) /
1ca772edSChristopher Bate              lineSizeBits};
1ca772edSChristopher Bate}
1ca772edSChristopher Bate
1ca772edSChristopher Bate} // namespace
1ca772edSChristopher Bate
1ca772edSChristopher BateFailureOr<WarpMatrixInfo> getWarpMatrixInfo(Operation *op) {
1ca772edSChristopher Bate  WarpMatrixInfo info;
1ca772edSChristopher Bate
1ca772edSChristopher Bate  // Determine the vector type.
1ca772edSChristopher Bate  if (vector::TransferWriteOp writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
1ca772edSChristopher Bate    info.vectorType = writeOp.getVectorType();
1ca772edSChristopher Bate  } else if (isa<vector::TransferReadOp, vector::ContractionOp,
1ca772edSChristopher Bate                 arith::ConstantOp>(op)) {
1ca772edSChristopher Bate    info.vectorType = op->getResult(0).getType().cast<VectorType>();
1ca772edSChristopher Bate  } else {
1ca772edSChristopher Bate    return op->emitError()
1ca772edSChristopher Bate           << "unhandled operation type in nvgpu.mma.sync conversion path";
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate
1ca772edSChristopher Bate  // Determine the operand role. We assume it is an accumulator/result unless it
1ca772edSChristopher Bate  // is directly consumed by a `vector.contract` op.
1ca772edSChristopher Bate  info.operandRole = MatMulOperandRole::C;
1ca772edSChristopher Bate  for (Operation *user : op->getUsers()) {
1ca772edSChristopher Bate    auto contract = dyn_cast<vector::ContractionOp>(user);
1ca772edSChristopher Bate    if (!contract)
1ca772edSChristopher Bate      continue;
1ca772edSChristopher Bate    if (contract.getLhs() == op->getResult(0)) {
1ca772edSChristopher Bate      info.operandRole = MatMulOperandRole::A;
1ca772edSChristopher Bate      break;
1ca772edSChristopher Bate    }
1ca772edSChristopher Bate    if (contract.getRhs() == op->getResult(0)) {
1ca772edSChristopher Bate      info.operandRole = MatMulOperandRole::B;
1ca772edSChristopher Bate      break;
1ca772edSChristopher Bate    }
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate  return info;
1ca772edSChristopher Bate}
1ca772edSChristopher Bate
1ca772edSChristopher Bateint64_t inferTileWidthInBits(const WarpMatrixInfo &type) {
1ca772edSChristopher Bate  bool isAcc = isAccumulatorOrResult(type.operandRole);
1ca772edSChristopher Bate  Type elType = type.vectorType.getElementType();
1ca772edSChristopher Bate  if (isAcc && elType.getIntOrFloatBitWidth() == 32) {
1ca772edSChristopher Bate    return 256;
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate  if (elType.getIntOrFloatBitWidth() == 64) {
1ca772edSChristopher Bate    return isAcc ? 512 : 256;
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate  return 128;
1ca772edSChristopher Bate}
1ca772edSChristopher Bate
1ca772edSChristopher BateFailureOr<FragmentElementInfo>
1ca772edSChristopher BategetMmaSyncRegisterType(const WarpMatrixInfo &type) {
1ca772edSChristopher Bate  MLIRContext *ctx = type.vectorType.getContext();
1ca772edSChristopher Bate  const bool isAccum = isAccumulatorOrResult(type.operandRole);
1ca772edSChristopher Bate
1ca772edSChristopher Bate  Type elType = type.vectorType.getElementType();
1ca772edSChristopher Bate  if (elType.isF16()) {
1ca772edSChristopher Bate    return FragmentElementInfo{
1ca772edSChristopher Bate        LLVM::getFixedVectorType(Float16Type::get(ctx), 2), 2, 32,
1ca772edSChristopher Bate        inferNumRegistersPerMatrixFragment(type)};
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate
1ca772edSChristopher Bate  // f64 operand
1ca772edSChristopher Bate  Type f64Ty = Float64Type::get(ctx);
1ca772edSChristopher Bate  if (elType.isF64()) {
1ca772edSChristopher Bate    return isAccum
1ca772edSChristopher Bate               ? FragmentElementInfo{LLVM::getFixedVectorType(f64Ty, 2), 2, 128,
1ca772edSChristopher Bate                                     inferNumRegistersPerMatrixFragment(type)}
1ca772edSChristopher Bate               : FragmentElementInfo{f64Ty, 1, 64,
1ca772edSChristopher Bate                                     inferNumRegistersPerMatrixFragment(type)};
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate
1ca772edSChristopher Bate  // int8 operand
1ca772edSChristopher Bate  if (elType.isInteger(8)) {
1ca772edSChristopher Bate    return FragmentElementInfo{
1ca772edSChristopher Bate        LLVM::getFixedVectorType(IntegerType::get(ctx, 8), 4), 4, 32,
1ca772edSChristopher Bate        inferNumRegistersPerMatrixFragment(type)};
1ca772edSChristopher Bate  }
670eee08SChristopher Bate
670eee08SChristopher Bate  // int4 operand
670eee08SChristopher Bate  if (elType.isInteger(4)) {
670eee08SChristopher Bate    return FragmentElementInfo{
670eee08SChristopher Bate        LLVM::getFixedVectorType(IntegerType::get(ctx, 4), 8), 8, 32,
670eee08SChristopher Bate        inferNumRegistersPerMatrixFragment(type)};
670eee08SChristopher Bate  }
670eee08SChristopher Bate
1ca772edSChristopher Bate  // Integer 32bit acc operands
1ca772edSChristopher Bate  if (elType.isInteger(32)) {
1ca772edSChristopher Bate    return FragmentElementInfo{
1ca772edSChristopher Bate        LLVM::getFixedVectorType(IntegerType::get(ctx, 32), 2), 2, 64,
1ca772edSChristopher Bate        inferNumRegistersPerMatrixFragment(type)};
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate
1ca772edSChristopher Bate  // Floating point 32bit operands
1ca772edSChristopher Bate  if (elType.isF32()) {
1ca772edSChristopher Bate    Type f32Ty = Float32Type::get(ctx);
1ca772edSChristopher Bate    return isAccum
1ca772edSChristopher Bate               ? FragmentElementInfo{LLVM::getFixedVectorType(f32Ty, 2), 2, 64,
1ca772edSChristopher Bate                                     inferNumRegistersPerMatrixFragment(type)}
1ca772edSChristopher Bate               : FragmentElementInfo{f32Ty, 1, 32,
1ca772edSChristopher Bate                                     inferNumRegistersPerMatrixFragment(type)};
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate  return failure();
1ca772edSChristopher Bate}
1ca772edSChristopher Bate
1ca772edSChristopher Batestatic AffineMap getRegisterIndexToTileOffsetMap(int64_t lineSize,
1ca772edSChristopher Bate                                                 Type elementType,
1ca772edSChristopher Bate                                                 ArrayRef<int64_t> operandShape,
1ca772edSChristopher Bate                                                 bool isAccumulator,
1ca772edSChristopher Bate                                                 int64_t elementsPerRegister,
1ca772edSChristopher Bate                                                 AffineExpr logicalValueId) {
1ca772edSChristopher Bate  const int64_t elementsPerLine =
1ca772edSChristopher Bate      lineSize / elementType.getIntOrFloatBitWidth();
1ca772edSChristopher Bate  const std::array<int64_t, 2> num8x128bTiles =
1ca772edSChristopher Bate      getTileShape(operandShape, elementType, lineSize);
1ca772edSChristopher Bate  AffineExpr registerIdx = logicalValueId.floorDiv(elementsPerRegister);
1ca772edSChristopher Bate  return AffineMap::get(
1ca772edSChristopher Bate      2, 0,
1ca772edSChristopher Bate      {(registerIdx % num8x128bTiles[0]) * 8,
1ca772edSChristopher Bate       (registerIdx.floorDiv(num8x128bTiles[0])) * elementsPerLine},
1ca772edSChristopher Bate      elementType.getContext());
1ca772edSChristopher Bate}
1ca772edSChristopher Bate
1ca772edSChristopher BateFailureOr<AffineMap>
1ca772edSChristopher BategetLaneIdAndValueIdToOperandCoord(Location loc, OpBuilder &builder,
1ca772edSChristopher Bate                                  const WarpMatrixInfo &fragmentType) {
1ca772edSChristopher Bate  Type elementType = fragmentType.vectorType.getElementType();
1ca772edSChristopher Bate  ArrayRef<int64_t> operandShape = fragmentType.vectorType.getShape();
1ca772edSChristopher Bate  FailureOr<nvgpu::FragmentElementInfo> regInfo =
1ca772edSChristopher Bate      getMmaSyncRegisterType(fragmentType);
1ca772edSChristopher Bate  if (failed(regInfo))
1ca772edSChristopher Bate    return failure();
1ca772edSChristopher Bate
1ca772edSChristopher Bate  const int64_t elementBitWidth = elementType.getIntOrFloatBitWidth();
1ca772edSChristopher Bate  const int64_t elementsPerRegister =
1ca772edSChristopher Bate      regInfo->registerWidthBits / elementBitWidth;
1ca772edSChristopher Bate  const int64_t lineSize = inferTileWidthInBits(fragmentType);
1ca772edSChristopher Bate
1ca772edSChristopher Bate  AffineExpr laneId, logicalValueIdDim;
1ca772edSChristopher Bate  bindDims(builder.getContext(), laneId, logicalValueIdDim);
1ca772edSChristopher Bate
1ca772edSChristopher Bate  // Determine what register logicalValueId corresponds to. Use that as a
1ca772edSChristopher Bate  // linear index into the coordinate mapping `index -> (tile row, tile col)`.
1ca772edSChristopher Bate  AffineMap registerIndexToTileCoord = getRegisterIndexToTileOffsetMap(
1ca772edSChristopher Bate      lineSize, elementType, operandShape,
1ca772edSChristopher Bate      isAccumulatorOrResult(fragmentType.operandRole), elementsPerRegister,
1ca772edSChristopher Bate      logicalValueIdDim);
1ca772edSChristopher Bate
1ca772edSChristopher Bate  auto makeMap = [&](ArrayRef<AffineExpr> dimExprs) -> AffineMap {
1ca772edSChristopher Bate    return AffineMap::get(2, 0, dimExprs, builder.getContext());
1ca772edSChristopher Bate  };
1ca772edSChristopher Bate
1ca772edSChristopher Bate  auto tileRow = registerIndexToTileCoord.getResult(0);
1ca772edSChristopher Bate  auto tileCol = registerIndexToTileCoord.getResult(1);
1ca772edSChristopher Bate  return makeMap({tileRow + laneId.floorDiv(kThreadsPerRow),
1ca772edSChristopher Bate                  tileCol + (laneId % kThreadsPerRow) * elementsPerRegister +
1ca772edSChristopher Bate                      (logicalValueIdDim % elementsPerRegister)});
1ca772edSChristopher Bate}
1ca772edSChristopher Bate
1ca772edSChristopher BateFailureOr<nvgpu::LdMatrixParams> getLdMatrixParams(const WarpMatrixInfo &type,
1ca772edSChristopher Bate                                                   bool transpose) {
1ca772edSChristopher Bate  LdMatrixParams params;
1ca772edSChristopher Bate  Type elType = type.vectorType.getElementType();
1ca772edSChristopher Bate  params.fragmentType = type.vectorType;
1ca772edSChristopher Bate  if (type.operandRole == MatMulOperandRole::A ||
1ca772edSChristopher Bate      type.operandRole == MatMulOperandRole::C) {
1ca772edSChristopher Bate    params.targetLayout = NVVM::MMALayout::row;
1ca772edSChristopher Bate  } else {
1ca772edSChristopher Bate    params.targetLayout = NVVM::MMALayout::col;
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate  ArrayRef<int64_t> shape = type.vectorType.getShape();
1ca772edSChristopher Bate  params.contiguousDimType =
1ca772edSChristopher Bate      transpose ? IteratorType::Parallel : IteratorType::Reduction;
1ca772edSChristopher Bate
670eee08SChristopher Bate  if (params.contiguousDimType == IteratorType::Reduction) {
1ca772edSChristopher Bate    params.numTiles = (shape[0] / kNumRowsPerTile) *
1ca772edSChristopher Bate                      ((shape[1] * elType.getIntOrFloatBitWidth()) / 128);
1ca772edSChristopher Bate  } else {
1ca772edSChristopher Bate    params.numTiles = (shape[1] / kNumRowsPerTile) *
1ca772edSChristopher Bate                      ((shape[0] * elType.getIntOrFloatBitWidth()) / 128);
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate
1ca772edSChristopher Bate  if (params.numTiles == 0)
1ca772edSChristopher Bate    return failure();
1ca772edSChristopher Bate
1ca772edSChristopher Bate  return params;
1ca772edSChristopher Bate}
1ca772edSChristopher Bate
1ca772edSChristopher BateFailureOr<AffineMap>
1ca772edSChristopher BategetLaneIdToLdMatrixMatrixCoord(Location loc, OpBuilder &builder,
1ca772edSChristopher Bate                               const LdMatrixParams &params) {
1ca772edSChristopher Bate  // One thread per 128b row.
1ca772edSChristopher Bate  const int64_t kNumThreadsPerTile = kNumRowsPerTile;
1ca772edSChristopher Bate  const int bitsPerElement = static_cast<int>(
1ca772edSChristopher Bate      params.fragmentType.getElementType().getIntOrFloatBitWidth());
1ca772edSChristopher Bate  const int kElementsPer128b = (128 / bitsPerElement);
1ca772edSChristopher Bate  ArrayRef<int64_t> operandShape = params.fragmentType.getShape();
1ca772edSChristopher Bate  AffineExpr d0 = getAffineDimExpr(0, builder.getContext());
1ca772edSChristopher Bate
1ca772edSChristopher Bate  auto makeMap = [&](ArrayRef<AffineExpr> dimExprs) -> AffineMap {
1ca772edSChristopher Bate    return AffineMap::get(1, 0, dimExprs, builder.getContext());
1ca772edSChristopher Bate  };
1ca772edSChristopher Bate
1ca772edSChristopher Bate  // This case corresponds to row-major A|C or col-major B operands.
1ca772edSChristopher Bate  if (params.contiguousDimType == IteratorType::Reduction) {
1ca772edSChristopher Bate    AffineExpr row = d0 % (operandShape[0]);
1ca772edSChristopher Bate    AffineExpr col = d0.floorDiv(operandShape[0]) * (kElementsPer128b);
1ca772edSChristopher Bate    return makeMap({row, col});
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate
1ca772edSChristopher Bate  // This case Corresponds to col-major A|C or row-major B operands. The
1ca772edSChristopher Bate  // operandShape given is already pre-transposed (e.g. 8x16 = KxN).
1ca772edSChristopher Bate  if (params.contiguousDimType == IteratorType::Parallel) {
1ca772edSChristopher Bate    const int64_t num8x128bCols = (operandShape[0] * bitsPerElement) / 128;
1ca772edSChristopher Bate    // Threads are assigned in groups of 8 first across columns, then to
1ca772edSChristopher Bate    // rows. This is transpose of what `ldmatrix` expects, but when
1ca772edSChristopher Bate    // `ldmatrix` gets the `.trans` qualifier, final the effect will be to
1ca772edSChristopher Bate    // transpose just the blocks.
1ca772edSChristopher Bate    auto groupIdx = d0.floorDiv(kNumThreadsPerTile);
1ca772edSChristopher Bate    auto tileCol = (groupIdx % num8x128bCols);
1ca772edSChristopher Bate    auto tileRow = groupIdx.floorDiv(num8x128bCols);
1ca772edSChristopher Bate    return makeMap({tileCol * kElementsPer128b,
1ca772edSChristopher Bate                    tileRow * kNumRowsPerTile + (d0 % kNumRowsPerTile)});
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate  return failure();
1ca772edSChristopher Bate}
1ca772edSChristopher Bate
1ca772edSChristopher BateLogicalResult
1ca772edSChristopher BatePrepareContractToGPUMMASync::matchAndRewrite(vector::ContractionOp op,
1ca772edSChristopher Bate                                             PatternRewriter &rewriter) const {
1ca772edSChristopher Bate  Location loc = op.getLoc();
1ca772edSChristopher Bate  Value lhs = op.getLhs();
1ca772edSChristopher Bate  Value rhs = op.getRhs();
1ca772edSChristopher Bate  Value res = op.getAcc();
1ca772edSChristopher Bate
1ca772edSChristopher Bate  // Set up the parallel/reduction structure in right form.
1ca772edSChristopher Bate  using MapList = ArrayRef<ArrayRef<AffineExpr>>;
1ca772edSChristopher Bate  auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
1ca772edSChristopher Bate  AffineExpr m;
1ca772edSChristopher Bate  AffineExpr n;
1ca772edSChristopher Bate  AffineExpr k;
1ca772edSChristopher Bate  bindDims(rewriter.getContext(), m, n, k);
1ca772edSChristopher Bate  static constexpr std::array<int64_t, 2> perm = {1, 0};
1ca772edSChristopher Bate  auto iteratorTypes = op.getIteratorTypes().getValue();
*d2c0572bSJacques Pienaar  SmallVector<AffineMap, 4> maps = op.getIndexingMapsArray();
1ca772edSChristopher Bate  if (iteratorTypes.size() != 3)
1ca772edSChristopher Bate    return failure();
1ca772edSChristopher Bate  if (!(isParallelIterator(iteratorTypes[0]) &&
1ca772edSChristopher Bate        isParallelIterator(iteratorTypes[1]) &&
1ca772edSChristopher Bate        isReductionIterator(iteratorTypes[2])))
1ca772edSChristopher Bate    return failure();
1ca772edSChristopher Bate
1ca772edSChristopher Bate  // The canonical form is "TNT" = A row-major, B col-major, C row-major.
1ca772edSChristopher Bate  const auto canonicalForm = infer({{m, k}, {n, k}, {m, n}});
1ca772edSChristopher Bate  if (maps == canonicalForm) {
1ca772edSChristopher Bate    return failure();
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate  if (maps == infer({{m, k}, {k, n}, {m, n}})) {
1ca772edSChristopher Bate    rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
1ca772edSChristopher Bate  } else if (maps == infer({{k, m}, {k, n}, {m, n}})) {
1ca772edSChristopher Bate    lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
1ca772edSChristopher Bate  } else if (maps == infer({{k, m}, {k, n}, {m, n}})) {
1ca772edSChristopher Bate    rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
1ca772edSChristopher Bate    lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
1ca772edSChristopher Bate  } else if (maps == infer({{k, m}, {k, n}, {n, m}})) {
1ca772edSChristopher Bate    std::swap(rhs, lhs);
1ca772edSChristopher Bate    rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
1ca772edSChristopher Bate    lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
1ca772edSChristopher Bate  } else if (maps == infer({{k, m}, {n, k}, {n, m}})) {
1ca772edSChristopher Bate    std::swap(rhs, lhs);
1ca772edSChristopher Bate    rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
1ca772edSChristopher Bate  } else if (maps == infer({{m, k}, {k, n}, {n, m}})) {
1ca772edSChristopher Bate    std::swap(lhs, rhs);
1ca772edSChristopher Bate    lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
1ca772edSChristopher Bate  } else if (maps == infer({{m, k}, {n, k}, {n, m}})) {
1ca772edSChristopher Bate    std::swap(lhs, rhs);
1ca772edSChristopher Bate  } else {
1ca772edSChristopher Bate    return failure();
1ca772edSChristopher Bate  }
1ca772edSChristopher Bate  rewriter.replaceOpWithNewOp<vector::ContractionOp>(
1ca772edSChristopher Bate      op, lhs, rhs, res, rewriter.getAffineMapArrayAttr(canonicalForm),
1ca772edSChristopher Bate      op.getIteratorTypes());
1ca772edSChristopher Bate  return success();
1ca772edSChristopher Bate}
1ca772edSChristopher Bate
1ca772edSChristopher Bate} // namespace nvgpu
1ca772edSChristopher Bate} // namespace mlir