//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//
1308778d8cSAlex Zinenko 
1408778d8cSAlex Zinenko #include "mlir/Dialect/GPU/MemoryPromotion.h"
15*a54f4eaeSMogball #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
1608778d8cSAlex Zinenko #include "mlir/Dialect/GPU/GPUDialect.h"
178eb18a0fSNicolas Vasilache #include "mlir/Dialect/MemRef/IR/MemRef.h"
1884a880e1SNicolas Vasilache #include "mlir/Dialect/SCF/SCF.h"
198eb18a0fSNicolas Vasilache #include "mlir/Dialect/StandardOps/IR/Ops.h"
20e3cf7c88SNicolas Vasilache #include "mlir/IR/ImplicitLocOpBuilder.h"
2108778d8cSAlex Zinenko #include "mlir/Pass/Pass.h"
2208778d8cSAlex Zinenko #include "mlir/Transforms/LoopUtils.h"
2308778d8cSAlex Zinenko 
2408778d8cSAlex Zinenko using namespace mlir;
2508778d8cSAlex Zinenko using namespace mlir::gpu;
2608778d8cSAlex Zinenko 
2708778d8cSAlex Zinenko /// Returns the textual name of a GPU dimension.
2808778d8cSAlex Zinenko static StringRef getDimName(unsigned dim) {
2908778d8cSAlex Zinenko   if (dim == 0)
3008778d8cSAlex Zinenko     return "x";
3108778d8cSAlex Zinenko   if (dim == 1)
3208778d8cSAlex Zinenko     return "y";
3308778d8cSAlex Zinenko   if (dim == 2)
3408778d8cSAlex Zinenko     return "z";
3508778d8cSAlex Zinenko 
3608778d8cSAlex Zinenko   llvm_unreachable("dimension ID overflow");
3708778d8cSAlex Zinenko }
3808778d8cSAlex Zinenko 
3908778d8cSAlex Zinenko /// Emits the (imperfect) loop nest performing the copy between "from" and "to"
4008778d8cSAlex Zinenko /// values using the bounds derived from the "from" value. Emits at least
4108778d8cSAlex Zinenko /// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
4208778d8cSAlex Zinenko /// single-iteration loops. Maps the innermost loops to thread dimensions, in
4308778d8cSAlex Zinenko /// reverse order to enable access coalescing in the innermost loop.
44e3cf7c88SNicolas Vasilache static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
45e3cf7c88SNicolas Vasilache   auto memRefType = from.getType().cast<MemRefType>();
46e3cf7c88SNicolas Vasilache   auto rank = memRefType.getRank();
47e3cf7c88SNicolas Vasilache 
48367229e1SNicolas Vasilache   SmallVector<Value, 4> lbs, ubs, steps;
49*a54f4eaeSMogball   Value zero = b.create<arith::ConstantIndexOp>(0);
50*a54f4eaeSMogball   Value one = b.create<arith::ConstantIndexOp>(1);
5108778d8cSAlex Zinenko 
5208778d8cSAlex Zinenko   // Make sure we have enough loops to use all thread dimensions, these trivial
5308778d8cSAlex Zinenko   // loops should be outermost and therefore inserted first.
5408778d8cSAlex Zinenko   if (rank < GPUDialect::getNumWorkgroupDimensions()) {
5508778d8cSAlex Zinenko     unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
5608778d8cSAlex Zinenko     lbs.resize(extraLoops, zero);
5708778d8cSAlex Zinenko     ubs.resize(extraLoops, one);
5808778d8cSAlex Zinenko     steps.resize(extraLoops, one);
5908778d8cSAlex Zinenko   }
6008778d8cSAlex Zinenko 
6173f371c3SKazuaki Ishizaki   // Add existing bounds.
62e3cf7c88SNicolas Vasilache   lbs.append(rank, zero);
63e3cf7c88SNicolas Vasilache   ubs.reserve(lbs.size());
6408778d8cSAlex Zinenko   steps.reserve(lbs.size());
65e3cf7c88SNicolas Vasilache   for (auto idx = 0; idx < rank; ++idx) {
66*a54f4eaeSMogball     ubs.push_back(b.createOrFold<memref::DimOp>(
67*a54f4eaeSMogball         from, b.create<arith::ConstantIndexOp>(idx)));
68e3cf7c88SNicolas Vasilache     steps.push_back(one);
69e3cf7c88SNicolas Vasilache   }
7008778d8cSAlex Zinenko 
7108778d8cSAlex Zinenko   // Obtain thread identifiers and block sizes, necessary to map to them.
7284a880e1SNicolas Vasilache   auto indexType = b.getIndexType();
7308778d8cSAlex Zinenko   SmallVector<Value, 3> threadIds, blockDims;
7408778d8cSAlex Zinenko   for (unsigned i = 0; i < 3; ++i) {
7584a880e1SNicolas Vasilache     auto dimName = b.getStringAttr(getDimName(i));
76e3cf7c88SNicolas Vasilache     threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dimName));
77e3cf7c88SNicolas Vasilache     blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dimName));
7808778d8cSAlex Zinenko   }
7908778d8cSAlex Zinenko 
8008778d8cSAlex Zinenko   // Produce the loop nest with copies.
81367229e1SNicolas Vasilache   SmallVector<Value, 8> ivs(lbs.size());
8284a880e1SNicolas Vasilache   mlir::scf::buildLoopNest(
83e3cf7c88SNicolas Vasilache       b, b.getLoc(), lbs, ubs, steps,
8484a880e1SNicolas Vasilache       [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
85d1560f39SAlex Zinenko         ivs.assign(loopIvs.begin(), loopIvs.end());
8608778d8cSAlex Zinenko         auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
8784a880e1SNicolas Vasilache         Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
8884a880e1SNicolas Vasilache         b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
8908778d8cSAlex Zinenko       });
9008778d8cSAlex Zinenko 
9108778d8cSAlex Zinenko   // Map the innermost loops to threads in reverse order.
9208778d8cSAlex Zinenko   for (auto en :
9308778d8cSAlex Zinenko        llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
9408778d8cSAlex Zinenko            GPUDialect::getNumWorkgroupDimensions())))) {
95367229e1SNicolas Vasilache     Value v = en.value();
96c25b20c0SAlex Zinenko     auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
9708778d8cSAlex Zinenko     mapLoopToProcessorIds(loop, {threadIds[en.index()]},
9808778d8cSAlex Zinenko                           {blockDims[en.index()]});
9908778d8cSAlex Zinenko   }
10008778d8cSAlex Zinenko }
10108778d8cSAlex Zinenko 
10208778d8cSAlex Zinenko /// Emits the loop nests performing the copy to the designated location in the
10308778d8cSAlex Zinenko /// beginning of the region, and from the designated location immediately before
10408778d8cSAlex Zinenko /// the terminator of the first block of the region. The region is expected to
10508778d8cSAlex Zinenko /// have one block. This boils down to the following structure
10608778d8cSAlex Zinenko ///
10708778d8cSAlex Zinenko ///   ^bb(...):
10808778d8cSAlex Zinenko ///     <loop-bound-computation>
10908778d8cSAlex Zinenko ///     for %arg0 = ... to ... step ... {
11008778d8cSAlex Zinenko ///       ...
11108778d8cSAlex Zinenko ///         for %argN = <thread-id-x> to ... step <block-dim-x> {
11208778d8cSAlex Zinenko ///           %0 = load %from[%arg0, ..., %argN]
11308778d8cSAlex Zinenko ///           store %0, %to[%arg0, ..., %argN]
11408778d8cSAlex Zinenko ///         }
11508778d8cSAlex Zinenko ///       ...
11608778d8cSAlex Zinenko ///     }
11708778d8cSAlex Zinenko ///     gpu.barrier
11808778d8cSAlex Zinenko ///     <... original body ...>
11908778d8cSAlex Zinenko ///     gpu.barrier
12008778d8cSAlex Zinenko ///     for %arg0 = ... to ... step ... {
12108778d8cSAlex Zinenko ///       ...
12208778d8cSAlex Zinenko ///         for %argN = <thread-id-x> to ... step <block-dim-x> {
12308778d8cSAlex Zinenko ///           %1 = load %to[%arg0, ..., %argN]
12408778d8cSAlex Zinenko ///           store %1, %from[%arg0, ..., %argN]
12508778d8cSAlex Zinenko ///         }
12608778d8cSAlex Zinenko ///       ...
12708778d8cSAlex Zinenko ///     }
12808778d8cSAlex Zinenko ///
12908778d8cSAlex Zinenko /// Inserts the barriers unconditionally since different threads may be copying
13008778d8cSAlex Zinenko /// values and reading them. An analysis would be required to eliminate barriers
13108778d8cSAlex Zinenko /// in case where value is only used by the thread that copies it. Both copies
13208778d8cSAlex Zinenko /// are inserted unconditionally, an analysis would be required to only copy
13308778d8cSAlex Zinenko /// live-in and live-out values when necessary. This copies the entire memref
13408778d8cSAlex Zinenko /// pointed to by "from". In case a smaller block would be sufficient, the
13508778d8cSAlex Zinenko /// caller can create a subview of the memref and promote it instead.
13608778d8cSAlex Zinenko static void insertCopies(Region &region, Location loc, Value from, Value to) {
13708778d8cSAlex Zinenko   auto fromType = from.getType().cast<MemRefType>();
13808778d8cSAlex Zinenko   auto toType = to.getType().cast<MemRefType>();
13908778d8cSAlex Zinenko   (void)fromType;
14008778d8cSAlex Zinenko   (void)toType;
14108778d8cSAlex Zinenko   assert(fromType.getShape() == toType.getShape());
14208778d8cSAlex Zinenko   assert(fromType.getRank() != 0);
143204c3b55SRiver Riddle   assert(llvm::hasSingleElement(region) &&
14408778d8cSAlex Zinenko          "unstructured control flow not supported");
14508778d8cSAlex Zinenko 
146e3cf7c88SNicolas Vasilache   auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
147e3cf7c88SNicolas Vasilache   insertCopyLoops(b, from, to);
148e3cf7c88SNicolas Vasilache   b.create<gpu::BarrierOp>();
14908778d8cSAlex Zinenko 
15084a880e1SNicolas Vasilache   b.setInsertionPoint(&region.front().back());
151e3cf7c88SNicolas Vasilache   b.create<gpu::BarrierOp>();
152e3cf7c88SNicolas Vasilache   insertCopyLoops(b, to, from);
15308778d8cSAlex Zinenko }
15408778d8cSAlex Zinenko 
15508778d8cSAlex Zinenko /// Promotes a function argument to workgroup memory in the given function. The
15608778d8cSAlex Zinenko /// copies will be inserted in the beginning and in the end of the function.
15708778d8cSAlex Zinenko void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
15808778d8cSAlex Zinenko   Value value = op.getArgument(arg);
15908778d8cSAlex Zinenko   auto type = value.getType().dyn_cast<MemRefType>();
16008778d8cSAlex Zinenko   assert(type && type.hasStaticShape() && "can only promote memrefs");
16108778d8cSAlex Zinenko 
162ad398164SWen-Heng (Jack) Chung   // Get the type of the buffer in the workgroup memory.
163ad398164SWen-Heng (Jack) Chung   int workgroupMemoryAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace();
164ad398164SWen-Heng (Jack) Chung   auto bufferType = MemRefType::get(type.getShape(), type.getElementType(), {},
165ad398164SWen-Heng (Jack) Chung                                     workgroupMemoryAddressSpace);
166ad398164SWen-Heng (Jack) Chung 
167ad398164SWen-Heng (Jack) Chung   Value attribution = op.addWorkgroupAttribution(bufferType);
16808778d8cSAlex Zinenko 
16908778d8cSAlex Zinenko   // Replace the uses first since only the original uses are currently present.
17008778d8cSAlex Zinenko   // Then insert the copies.
17108778d8cSAlex Zinenko   value.replaceAllUsesWith(attribution);
17208778d8cSAlex Zinenko   insertCopies(op.getBody(), op.getLoc(), value, attribution);
17308778d8cSAlex Zinenko }
174