//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/MemoryPromotion.h"

#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"

using namespace mlir;
using namespace mlir::gpu;

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
  auto memRefType = from.getType().cast<MemRefType>();
  auto rank = memRefType.getRank();

  SmallVector<Value, 4> lbs, ubs, steps;
  Value zero = b.create<arith::ConstantIndexOp>(0);
  Value one = b.create<arith::ConstantIndexOp>(1);

  // Make sure we have enough loops to use all thread dimensions; these
  // trivial loops should be outermost and are therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(rank, zero);
  ubs.reserve(lbs.size());
  steps.reserve(lbs.size());
  for (auto idx = 0; idx < rank; ++idx) {
    ubs.push_back(b.createOrFold<memref::DimOp>(
        from, b.create<arith::ConstantIndexOp>(idx)));
    steps.push_back(one);
  }

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = b.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
    threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dim));
    blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dim));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  mlir::scf::buildLoopNest(
      b, b.getLoc(), lbs, ubs, steps,
      [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
        ivs.assign(loopIvs.begin(), loopIvs.end());
        auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
        Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
        b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
      });

  // Map the innermost loops to threads in reverse order.
  for (const auto &en :
       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                          {blockDims[en.index()]});
  }
}
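
// For illustration (a sketch; SSA value names and the exact op order are not
// meant to be literal), copying a rank-3 memref<?x?x?xf32> produces a nest
// equivalent to:
//
//   %d0 = memref.dim %from, %c0
//   %d1 = memref.dim %from, %c1
//   %d2 = memref.dim %from, %c2
//   scf.for %i = %tid_z to %d0 step %bdim_z {
//     scf.for %j = %tid_y to %d1 step %bdim_y {
//       scf.for %k = %tid_x to %d2 step %bdim_x {
//         %v = memref.load %from[%i, %j, %k]
//         memref.store %v, %to[%i, %j, %k]
//       }
//     }
//   }
//
// The reverse mapping assigns the x dimension to the innermost loop so that
// consecutive threads touch consecutive elements, enabling coalesced accesses.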

/// Emits the loop nests performing the copy to the designated location in the
/// beginning of the region, and from the designated location immediately
/// before the terminator of the first block of the region. The region is
/// expected to have one block. This boils down to the following structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %0 = load %from[%arg0, ..., %argN]
///         store %0, %to[%arg0, ..., %argN]
///       }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %1 = load %to[%arg0, ..., %argN]
///         store %1, %from[%arg0, ..., %argN]
///       }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate
/// barriers in cases where a value is only used by the thread that copies it.
/// Both copies are inserted unconditionally; an analysis would be required to
/// only copy live-in and live-out values when necessary. This copies the
/// entire memref pointed to by "from". In case a smaller block would be
/// sufficient, the caller can create a subview of the memref and promote it
/// instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = from.getType().cast<MemRefType>();
  auto toType = to.getType().cast<MemRefType>();
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(llvm::hasSingleElement(region) &&
         "unstructured control flow not supported");

  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
  insertCopyLoops(b, from, to);
  b.create<gpu::BarrierOp>();

  b.setInsertionPoint(&region.front().back());
  b.create<gpu::BarrierOp>();
  insertCopyLoops(b, to, from);
}
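
// For example (a hedged sketch; `b`, `buffer`, `offsets`, `sizes`, `strides`,
// and `attribution` stand for values the caller would already have), promoting
// only a tile of a larger buffer amounts to copying through a subview rather
// than the whole memref:
//
//   Value tile = b.create<memref::SubViewOp>(loc, buffer, offsets, sizes,
//                                            strides);
//   insertCopies(region, loc, tile, attribution);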

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and at the end of the function.
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = value.getType().dyn_cast<MemRefType>();
  assert(type && type.hasStaticShape() &&
         "can only promote statically shaped memrefs");

  // Get the type of the buffer in the workgroup memory.
  int workgroupMemoryAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace();
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(), {},
                                    workgroupMemoryAddressSpace);
  Value attribution = op.addWorkgroupAttribution(bufferType, value.getLoc());

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}
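
/// A minimal usage sketch (illustrative only; this helper is an assumption
/// rather than part of the upstream API, which is exercised in-tree through a
/// test pass): promotes every eligible argument of a kernel function. The
/// filter mirrors the precondition asserted by promoteToWorkgroupMemory.
static void promoteAllEligibleArguments(GPUFuncOp op) {
  for (unsigned i = 0, e = op.getNumArguments(); i < e; ++i) {
    auto type = op.getArgument(i).getType().dyn_cast<MemRefType>();
    // Only statically shaped memref arguments can currently be promoted.
    if (type && type.hasStaticShape())
      promoteToWorkgroupMemory(op, i);
  }
}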