//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/MemoryPromotion.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"

using namespace mlir;
using namespace mlir::gpu;

/// Returns the textual name of a GPU dimension.
2808778d8cSAlex Zinenko static StringRef getDimName(unsigned dim) { 2908778d8cSAlex Zinenko if (dim == 0) 3008778d8cSAlex Zinenko return "x"; 3108778d8cSAlex Zinenko if (dim == 1) 3208778d8cSAlex Zinenko return "y"; 3308778d8cSAlex Zinenko if (dim == 2) 3408778d8cSAlex Zinenko return "z"; 3508778d8cSAlex Zinenko 3608778d8cSAlex Zinenko llvm_unreachable("dimension ID overflow"); 3708778d8cSAlex Zinenko } 3808778d8cSAlex Zinenko 3908778d8cSAlex Zinenko /// Emits the (imperfect) loop nest performing the copy between "from" and "to" 4008778d8cSAlex Zinenko /// values using the bounds derived from the "from" value. Emits at least 4108778d8cSAlex Zinenko /// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with 4208778d8cSAlex Zinenko /// single-iteration loops. Maps the innermost loops to thread dimensions, in 4308778d8cSAlex Zinenko /// reverse order to enable access coalescing in the innermost loop. 44e3cf7c88SNicolas Vasilache static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) { 45e3cf7c88SNicolas Vasilache auto memRefType = from.getType().cast<MemRefType>(); 46e3cf7c88SNicolas Vasilache auto rank = memRefType.getRank(); 47e3cf7c88SNicolas Vasilache 48367229e1SNicolas Vasilache SmallVector<Value, 4> lbs, ubs, steps; 49*a54f4eaeSMogball Value zero = b.create<arith::ConstantIndexOp>(0); 50*a54f4eaeSMogball Value one = b.create<arith::ConstantIndexOp>(1); 5108778d8cSAlex Zinenko 5208778d8cSAlex Zinenko // Make sure we have enough loops to use all thread dimensions, these trivial 5308778d8cSAlex Zinenko // loops should be outermost and therefore inserted first. 
5408778d8cSAlex Zinenko if (rank < GPUDialect::getNumWorkgroupDimensions()) { 5508778d8cSAlex Zinenko unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank; 5608778d8cSAlex Zinenko lbs.resize(extraLoops, zero); 5708778d8cSAlex Zinenko ubs.resize(extraLoops, one); 5808778d8cSAlex Zinenko steps.resize(extraLoops, one); 5908778d8cSAlex Zinenko } 6008778d8cSAlex Zinenko 6173f371c3SKazuaki Ishizaki // Add existing bounds. 62e3cf7c88SNicolas Vasilache lbs.append(rank, zero); 63e3cf7c88SNicolas Vasilache ubs.reserve(lbs.size()); 6408778d8cSAlex Zinenko steps.reserve(lbs.size()); 65e3cf7c88SNicolas Vasilache for (auto idx = 0; idx < rank; ++idx) { 66*a54f4eaeSMogball ubs.push_back(b.createOrFold<memref::DimOp>( 67*a54f4eaeSMogball from, b.create<arith::ConstantIndexOp>(idx))); 68e3cf7c88SNicolas Vasilache steps.push_back(one); 69e3cf7c88SNicolas Vasilache } 7008778d8cSAlex Zinenko 7108778d8cSAlex Zinenko // Obtain thread identifiers and block sizes, necessary to map to them. 7284a880e1SNicolas Vasilache auto indexType = b.getIndexType(); 7308778d8cSAlex Zinenko SmallVector<Value, 3> threadIds, blockDims; 7408778d8cSAlex Zinenko for (unsigned i = 0; i < 3; ++i) { 7584a880e1SNicolas Vasilache auto dimName = b.getStringAttr(getDimName(i)); 76e3cf7c88SNicolas Vasilache threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dimName)); 77e3cf7c88SNicolas Vasilache blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dimName)); 7808778d8cSAlex Zinenko } 7908778d8cSAlex Zinenko 8008778d8cSAlex Zinenko // Produce the loop nest with copies. 
81367229e1SNicolas Vasilache SmallVector<Value, 8> ivs(lbs.size()); 8284a880e1SNicolas Vasilache mlir::scf::buildLoopNest( 83e3cf7c88SNicolas Vasilache b, b.getLoc(), lbs, ubs, steps, 8484a880e1SNicolas Vasilache [&](OpBuilder &b, Location loc, ValueRange loopIvs) { 85d1560f39SAlex Zinenko ivs.assign(loopIvs.begin(), loopIvs.end()); 8608778d8cSAlex Zinenko auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank); 8784a880e1SNicolas Vasilache Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs); 8884a880e1SNicolas Vasilache b.create<memref::StoreOp>(loc, loaded, to, activeIvs); 8908778d8cSAlex Zinenko }); 9008778d8cSAlex Zinenko 9108778d8cSAlex Zinenko // Map the innermost loops to threads in reverse order. 9208778d8cSAlex Zinenko for (auto en : 9308778d8cSAlex Zinenko llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back( 9408778d8cSAlex Zinenko GPUDialect::getNumWorkgroupDimensions())))) { 95367229e1SNicolas Vasilache Value v = en.value(); 96c25b20c0SAlex Zinenko auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp()); 9708778d8cSAlex Zinenko mapLoopToProcessorIds(loop, {threadIds[en.index()]}, 9808778d8cSAlex Zinenko {blockDims[en.index()]}); 9908778d8cSAlex Zinenko } 10008778d8cSAlex Zinenko } 10108778d8cSAlex Zinenko 10208778d8cSAlex Zinenko /// Emits the loop nests performing the copy to the designated location in the 10308778d8cSAlex Zinenko /// beginning of the region, and from the designated location immediately before 10408778d8cSAlex Zinenko /// the terminator of the first block of the region. The region is expected to 10508778d8cSAlex Zinenko /// have one block. This boils down to the following structure 10608778d8cSAlex Zinenko /// 10708778d8cSAlex Zinenko /// ^bb(...): 10808778d8cSAlex Zinenko /// <loop-bound-computation> 10908778d8cSAlex Zinenko /// for %arg0 = ... to ... step ... { 11008778d8cSAlex Zinenko /// ... 11108778d8cSAlex Zinenko /// for %argN = <thread-id-x> to ... 
step <block-dim-x> { 11208778d8cSAlex Zinenko /// %0 = load %from[%arg0, ..., %argN] 11308778d8cSAlex Zinenko /// store %0, %to[%arg0, ..., %argN] 11408778d8cSAlex Zinenko /// } 11508778d8cSAlex Zinenko /// ... 11608778d8cSAlex Zinenko /// } 11708778d8cSAlex Zinenko /// gpu.barrier 11808778d8cSAlex Zinenko /// <... original body ...> 11908778d8cSAlex Zinenko /// gpu.barrier 12008778d8cSAlex Zinenko /// for %arg0 = ... to ... step ... { 12108778d8cSAlex Zinenko /// ... 12208778d8cSAlex Zinenko /// for %argN = <thread-id-x> to ... step <block-dim-x> { 12308778d8cSAlex Zinenko /// %1 = load %to[%arg0, ..., %argN] 12408778d8cSAlex Zinenko /// store %1, %from[%arg0, ..., %argN] 12508778d8cSAlex Zinenko /// } 12608778d8cSAlex Zinenko /// ... 12708778d8cSAlex Zinenko /// } 12808778d8cSAlex Zinenko /// 12908778d8cSAlex Zinenko /// Inserts the barriers unconditionally since different threads may be copying 13008778d8cSAlex Zinenko /// values and reading them. An analysis would be required to eliminate barriers 13108778d8cSAlex Zinenko /// in case where value is only used by the thread that copies it. Both copies 13208778d8cSAlex Zinenko /// are inserted unconditionally, an analysis would be required to only copy 13308778d8cSAlex Zinenko /// live-in and live-out values when necessary. This copies the entire memref 13408778d8cSAlex Zinenko /// pointed to by "from". In case a smaller block would be sufficient, the 13508778d8cSAlex Zinenko /// caller can create a subview of the memref and promote it instead. 
13608778d8cSAlex Zinenko static void insertCopies(Region ®ion, Location loc, Value from, Value to) { 13708778d8cSAlex Zinenko auto fromType = from.getType().cast<MemRefType>(); 13808778d8cSAlex Zinenko auto toType = to.getType().cast<MemRefType>(); 13908778d8cSAlex Zinenko (void)fromType; 14008778d8cSAlex Zinenko (void)toType; 14108778d8cSAlex Zinenko assert(fromType.getShape() == toType.getShape()); 14208778d8cSAlex Zinenko assert(fromType.getRank() != 0); 143204c3b55SRiver Riddle assert(llvm::hasSingleElement(region) && 14408778d8cSAlex Zinenko "unstructured control flow not supported"); 14508778d8cSAlex Zinenko 146e3cf7c88SNicolas Vasilache auto b = ImplicitLocOpBuilder::atBlockBegin(loc, ®ion.front()); 147e3cf7c88SNicolas Vasilache insertCopyLoops(b, from, to); 148e3cf7c88SNicolas Vasilache b.create<gpu::BarrierOp>(); 14908778d8cSAlex Zinenko 15084a880e1SNicolas Vasilache b.setInsertionPoint(®ion.front().back()); 151e3cf7c88SNicolas Vasilache b.create<gpu::BarrierOp>(); 152e3cf7c88SNicolas Vasilache insertCopyLoops(b, to, from); 15308778d8cSAlex Zinenko } 15408778d8cSAlex Zinenko 15508778d8cSAlex Zinenko /// Promotes a function argument to workgroup memory in the given function. The 15608778d8cSAlex Zinenko /// copies will be inserted in the beginning and in the end of the function. 15708778d8cSAlex Zinenko void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) { 15808778d8cSAlex Zinenko Value value = op.getArgument(arg); 15908778d8cSAlex Zinenko auto type = value.getType().dyn_cast<MemRefType>(); 16008778d8cSAlex Zinenko assert(type && type.hasStaticShape() && "can only promote memrefs"); 16108778d8cSAlex Zinenko 162ad398164SWen-Heng (Jack) Chung // Get the type of the buffer in the workgroup memory. 
163ad398164SWen-Heng (Jack) Chung int workgroupMemoryAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace(); 164ad398164SWen-Heng (Jack) Chung auto bufferType = MemRefType::get(type.getShape(), type.getElementType(), {}, 165ad398164SWen-Heng (Jack) Chung workgroupMemoryAddressSpace); 166ad398164SWen-Heng (Jack) Chung 167ad398164SWen-Heng (Jack) Chung Value attribution = op.addWorkgroupAttribution(bufferType); 16808778d8cSAlex Zinenko 16908778d8cSAlex Zinenko // Replace the uses first since only the original uses are currently present. 17008778d8cSAlex Zinenko // Then insert the copies. 17108778d8cSAlex Zinenko value.replaceAllUsesWith(attribution); 17208778d8cSAlex Zinenko insertCopies(op.getBody(), op.getLoc(), value, attribution); 17308778d8cSAlex Zinenko } 174