//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel.  The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::scf;

// Name of internal attribute to mark visited operations during conversion.
//
// NOTE: The conversion originally used the following legality criteria:
//   `!parallelOp->hasAttr(gpu::getMappingAttrName())`
// But the provided pattern might reject some cases based on more detailed
// analysis of the `mapping` attribute.
// To avoid dialect conversion failure due to non-converted illegal operation
// we use this extra Unit attribute as a marker, that the operation was checked
// by the pattern and should be considered as legal in the following legality
// checks. The `finalizeParallelLoopToGPUConversion` function performs clean up
// of these extra attributes and is supposed to be called after the dialect
// conversion.
//
// TODO: Implement a cleaner solution, factoring out the "matching" logic
// from the pattern and its callees into a separate function that can be called
// from both the pattern and the op legality check.
55ec03bbe8SVladislav Vinogradov static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited"; 56ec03bbe8SVladislav Vinogradov 574ead2cf7SAlex Zinenko // Extract an indexed value from KernelDim3. 584ead2cf7SAlex Zinenko static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) { 594ead2cf7SAlex Zinenko switch (pos) { 604ead2cf7SAlex Zinenko case 0: 614ead2cf7SAlex Zinenko return dim3.x; 624ead2cf7SAlex Zinenko case 1: 634ead2cf7SAlex Zinenko return dim3.y; 644ead2cf7SAlex Zinenko case 2: 654ead2cf7SAlex Zinenko return dim3.z; 664ead2cf7SAlex Zinenko default: 674ead2cf7SAlex Zinenko llvm_unreachable("dim3 position out of bounds"); 684ead2cf7SAlex Zinenko } 694ead2cf7SAlex Zinenko return nullptr; 704ead2cf7SAlex Zinenko } 714ead2cf7SAlex Zinenko 724ead2cf7SAlex Zinenko // Get the lower bound-related operands of a loop operation. 734ead2cf7SAlex Zinenko static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) { 744ead2cf7SAlex Zinenko return forOp.getLowerBoundOperands(); 754ead2cf7SAlex Zinenko } 764ead2cf7SAlex Zinenko 774ead2cf7SAlex Zinenko // Get the upper bound-related operands of a loop operation. 784ead2cf7SAlex Zinenko static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) { 794ead2cf7SAlex Zinenko return forOp.getUpperBoundOperands(); 804ead2cf7SAlex Zinenko } 814ead2cf7SAlex Zinenko 824ead2cf7SAlex Zinenko // Get a Value that corresponds to the loop step. If the step is an attribute, 834ead2cf7SAlex Zinenko // materialize a corresponding constant using builder. 844ead2cf7SAlex Zinenko static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) { 85a54f4eaeSMogball return builder.create<arith::ConstantIndexOp>(forOp.getLoc(), 86a54f4eaeSMogball forOp.getStep()); 874ead2cf7SAlex Zinenko } 884ead2cf7SAlex Zinenko 894ead2cf7SAlex Zinenko // Get a Value for the loop lower bound. If the value requires computation, 904ead2cf7SAlex Zinenko // materialize the instructions using builder. 
914ead2cf7SAlex Zinenko static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) { 924ead2cf7SAlex Zinenko return lowerAffineLowerBound(forOp, builder); 934ead2cf7SAlex Zinenko } 944ead2cf7SAlex Zinenko 954ead2cf7SAlex Zinenko // Get a Value for the loop upper bound. If the value requires computation, 964ead2cf7SAlex Zinenko // materialize the instructions using builder. 974ead2cf7SAlex Zinenko static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) { 984ead2cf7SAlex Zinenko return lowerAffineUpperBound(forOp, builder); 994ead2cf7SAlex Zinenko } 1004ead2cf7SAlex Zinenko 1014ead2cf7SAlex Zinenko // Check the structure of the loop nest: 1024ead2cf7SAlex Zinenko // - there are enough loops to map to numDims; 1034ead2cf7SAlex Zinenko // - the loops are perfectly nested; 1044ead2cf7SAlex Zinenko // - the loop bounds can be computed above the outermost loop. 1054ead2cf7SAlex Zinenko // This roughly corresponds to the "matcher" part of the pattern-based 1064ead2cf7SAlex Zinenko // rewriting infrastructure. 1072bcd1927SMaheshRavishankar static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp, 1082bcd1927SMaheshRavishankar unsigned numDims) { 1094ead2cf7SAlex Zinenko Region &limit = forOp.region(); 1104ead2cf7SAlex Zinenko for (unsigned i = 0, e = numDims; i < e; ++i) { 1114ead2cf7SAlex Zinenko Operation *nested = &forOp.getBody()->front(); 1124ead2cf7SAlex Zinenko if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) || 1134ead2cf7SAlex Zinenko !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit)) 1144ead2cf7SAlex Zinenko return forOp.emitError( 1154ead2cf7SAlex Zinenko "loops with bounds depending on other mapped loops " 1164ead2cf7SAlex Zinenko "are not supported"); 1174ead2cf7SAlex Zinenko 1184ead2cf7SAlex Zinenko // The innermost loop can have an arbitrary body, skip the perfect nesting 1194ead2cf7SAlex Zinenko // check for it. 
1204ead2cf7SAlex Zinenko if (i == e - 1) 1214ead2cf7SAlex Zinenko break; 1224ead2cf7SAlex Zinenko 1234ead2cf7SAlex Zinenko auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end(); 1244ead2cf7SAlex Zinenko if (forOp.getBody()->empty() || std::next(begin, 2) != end) 1254ead2cf7SAlex Zinenko return forOp.emitError("expected perfectly nested loops in the body"); 1264ead2cf7SAlex Zinenko 1272bcd1927SMaheshRavishankar if (!(forOp = dyn_cast<AffineForOp>(nested))) 1284ead2cf7SAlex Zinenko return nested->emitError("expected a nested loop"); 1294ead2cf7SAlex Zinenko } 1304ead2cf7SAlex Zinenko return success(); 1314ead2cf7SAlex Zinenko } 1324ead2cf7SAlex Zinenko 1332bcd1927SMaheshRavishankar static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp, 1342bcd1927SMaheshRavishankar unsigned numBlockDims, 1354ead2cf7SAlex Zinenko unsigned numThreadDims) { 1364ead2cf7SAlex Zinenko if (numBlockDims < 1 || numThreadDims < 1) { 1374ead2cf7SAlex Zinenko LLVM_DEBUG(llvm::dbgs() << "nothing to map"); 1384ead2cf7SAlex Zinenko return success(); 1394ead2cf7SAlex Zinenko } 1404ead2cf7SAlex Zinenko 1414ead2cf7SAlex Zinenko if (numBlockDims > 3) { 1424ead2cf7SAlex Zinenko return forOp.emitError("cannot map to more than 3 block dimensions"); 1434ead2cf7SAlex Zinenko } 1444ead2cf7SAlex Zinenko if (numThreadDims > 3) { 1454ead2cf7SAlex Zinenko return forOp.emitError("cannot map to more than 3 thread dimensions"); 1464ead2cf7SAlex Zinenko } 1472bcd1927SMaheshRavishankar return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims); 1484ead2cf7SAlex Zinenko } 1494ead2cf7SAlex Zinenko 1504ead2cf7SAlex Zinenko namespace { 1514ead2cf7SAlex Zinenko // Helper structure that holds common state of the loop to GPU kernel 1524ead2cf7SAlex Zinenko // conversion. 
1532bcd1927SMaheshRavishankar struct AffineLoopToGpuConverter { 1542bcd1927SMaheshRavishankar Optional<AffineForOp> collectBounds(AffineForOp forOp, unsigned numLoops); 1554ead2cf7SAlex Zinenko 1562bcd1927SMaheshRavishankar void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp, 1572bcd1927SMaheshRavishankar unsigned numBlockDims, unsigned numThreadDims); 1584ead2cf7SAlex Zinenko 1594ead2cf7SAlex Zinenko // Ranges of the loops mapped to blocks or threads. 1604ead2cf7SAlex Zinenko SmallVector<Value, 6> dims; 1614ead2cf7SAlex Zinenko // Lower bounds of the loops mapped to blocks or threads. 1624ead2cf7SAlex Zinenko SmallVector<Value, 6> lbs; 1634ead2cf7SAlex Zinenko // Induction variables of the loops mapped to blocks or threads. 1644ead2cf7SAlex Zinenko SmallVector<Value, 6> ivs; 1654ead2cf7SAlex Zinenko // Steps of the loops mapped to blocks or threads. 1664ead2cf7SAlex Zinenko SmallVector<Value, 6> steps; 1674ead2cf7SAlex Zinenko }; 1684ead2cf7SAlex Zinenko } // namespace 1694ead2cf7SAlex Zinenko 1704ead2cf7SAlex Zinenko // Return true if the value is obviously a constant "one". 1714ead2cf7SAlex Zinenko static bool isConstantOne(Value value) { 172a54f4eaeSMogball if (auto def = value.getDefiningOp<arith::ConstantIndexOp>()) 173a54f4eaeSMogball return def.value() == 1; 1744ead2cf7SAlex Zinenko return false; 1754ead2cf7SAlex Zinenko } 1764ead2cf7SAlex Zinenko 1774ead2cf7SAlex Zinenko // Collect ranges, bounds, steps and induction variables in preparation for 1784ead2cf7SAlex Zinenko // mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel. 1794ead2cf7SAlex Zinenko // This may fail if the IR for computing loop bounds cannot be constructed, for 1804ead2cf7SAlex Zinenko // example if an affine loop uses semi-affine maps. Return the last loop to be 1814ead2cf7SAlex Zinenko // mapped on success, llvm::None on failure. 
1822bcd1927SMaheshRavishankar Optional<AffineForOp> 1832bcd1927SMaheshRavishankar AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) { 1844ead2cf7SAlex Zinenko OpBuilder builder(forOp.getOperation()); 1854ead2cf7SAlex Zinenko dims.reserve(numLoops); 1864ead2cf7SAlex Zinenko lbs.reserve(numLoops); 1874ead2cf7SAlex Zinenko ivs.reserve(numLoops); 1884ead2cf7SAlex Zinenko steps.reserve(numLoops); 1892bcd1927SMaheshRavishankar AffineForOp currentLoop = forOp; 1904ead2cf7SAlex Zinenko for (unsigned i = 0; i < numLoops; ++i) { 1914ead2cf7SAlex Zinenko Value lowerBound = getOrEmitLowerBound(currentLoop, builder); 1924ead2cf7SAlex Zinenko Value upperBound = getOrEmitUpperBound(currentLoop, builder); 1934ead2cf7SAlex Zinenko if (!lowerBound || !upperBound) { 1944ead2cf7SAlex Zinenko return llvm::None; 1954ead2cf7SAlex Zinenko } 1964ead2cf7SAlex Zinenko 197a54f4eaeSMogball Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(), 198a54f4eaeSMogball upperBound, lowerBound); 1994ead2cf7SAlex Zinenko Value step = getOrCreateStep(currentLoop, builder); 2004ead2cf7SAlex Zinenko if (!isConstantOne(step)) 201a54f4eaeSMogball range = builder.create<arith::DivSIOp>(currentLoop.getLoc(), range, step); 2024ead2cf7SAlex Zinenko dims.push_back(range); 2034ead2cf7SAlex Zinenko 2044ead2cf7SAlex Zinenko lbs.push_back(lowerBound); 2054ead2cf7SAlex Zinenko ivs.push_back(currentLoop.getInductionVar()); 2064ead2cf7SAlex Zinenko steps.push_back(step); 2074ead2cf7SAlex Zinenko 2084ead2cf7SAlex Zinenko if (i != numLoops - 1) 2092bcd1927SMaheshRavishankar currentLoop = cast<AffineForOp>(¤tLoop.getBody()->front()); 2104ead2cf7SAlex Zinenko } 2114ead2cf7SAlex Zinenko return currentLoop; 2124ead2cf7SAlex Zinenko } 2134ead2cf7SAlex Zinenko 2144ead2cf7SAlex Zinenko // Replace the rooted at "rootForOp" with a GPU launch operation. 
This expects 2154ead2cf7SAlex Zinenko // "innermostForOp" to point to the last loop to be transformed to the kernel, 2164ead2cf7SAlex Zinenko // and to have (numBlockDims + numThreadDims) perfectly nested loops between 2174ead2cf7SAlex Zinenko // "rootForOp" and "innermostForOp". 2182bcd1927SMaheshRavishankar void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp, 2192bcd1927SMaheshRavishankar AffineForOp innermostForOp, 2204ead2cf7SAlex Zinenko unsigned numBlockDims, 2214ead2cf7SAlex Zinenko unsigned numThreadDims) { 2224ead2cf7SAlex Zinenko OpBuilder builder(rootForOp.getOperation()); 2234ead2cf7SAlex Zinenko // Prepare the grid and block sizes for the launch operation. If there is 2244ead2cf7SAlex Zinenko // no loop mapped to a specific dimension, use constant "1" as its size. 225a54f4eaeSMogball Value constOne = 226a54f4eaeSMogball (numBlockDims < 3 || numThreadDims < 3) 227a54f4eaeSMogball ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1) 2284ead2cf7SAlex Zinenko : nullptr; 2294ead2cf7SAlex Zinenko Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne; 2304ead2cf7SAlex Zinenko Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne; 2314ead2cf7SAlex Zinenko Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne; 2324ead2cf7SAlex Zinenko Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne; 2334ead2cf7SAlex Zinenko Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne; 2344ead2cf7SAlex Zinenko Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne; 2354ead2cf7SAlex Zinenko 2364ead2cf7SAlex Zinenko // Create a launch op and move the body region of the innermost loop to the 2374ead2cf7SAlex Zinenko // launch op. 
2384ead2cf7SAlex Zinenko auto launchOp = builder.create<gpu::LaunchOp>( 2394ead2cf7SAlex Zinenko rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX, 2404ead2cf7SAlex Zinenko blockSizeY, blockSizeZ); 2414ead2cf7SAlex Zinenko 2424ead2cf7SAlex Zinenko // Replace the loop terminator (loops contain only a single block) with the 2434ead2cf7SAlex Zinenko // gpu terminator and move the operations from the loop body block to the gpu 2444ead2cf7SAlex Zinenko // launch body block. Do not move the entire block because of the difference 2454ead2cf7SAlex Zinenko // in block arguments. 2464ead2cf7SAlex Zinenko Operation &terminator = innermostForOp.getBody()->back(); 2474ead2cf7SAlex Zinenko Location terminatorLoc = terminator.getLoc(); 2484ead2cf7SAlex Zinenko terminator.erase(); 2494ead2cf7SAlex Zinenko builder.setInsertionPointToEnd(innermostForOp.getBody()); 2504ead2cf7SAlex Zinenko builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None); 2514ead2cf7SAlex Zinenko launchOp.body().front().getOperations().splice( 2524ead2cf7SAlex Zinenko launchOp.body().front().begin(), 2534ead2cf7SAlex Zinenko innermostForOp.getBody()->getOperations()); 2544ead2cf7SAlex Zinenko 2554ead2cf7SAlex Zinenko // Remap the loop iterators to use block/thread identifiers instead. Loops 2564ead2cf7SAlex Zinenko // may iterate from LB with step S whereas GPU thread/block ids always iterate 2574ead2cf7SAlex Zinenko // from 0 to N with step 1. Therefore, loop induction variables are replaced 2584ead2cf7SAlex Zinenko // with (gpu-thread/block-id * S) + LB. 2594ead2cf7SAlex Zinenko builder.setInsertionPointToStart(&launchOp.body().front()); 26002b6fb21SMehdi Amini auto *lbArgumentIt = lbs.begin(); 26102b6fb21SMehdi Amini auto *stepArgumentIt = steps.begin(); 262e4853be2SMehdi Amini for (const auto &en : llvm::enumerate(ivs)) { 2634ead2cf7SAlex Zinenko Value id = 2644ead2cf7SAlex Zinenko en.index() < numBlockDims 2654ead2cf7SAlex Zinenko ? 
getDim3Value(launchOp.getBlockIds(), en.index()) 2664ead2cf7SAlex Zinenko : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims); 2674ead2cf7SAlex Zinenko Value step = steps[en.index()]; 2684ead2cf7SAlex Zinenko if (!isConstantOne(step)) 269a54f4eaeSMogball id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id); 2704ead2cf7SAlex Zinenko 2714ead2cf7SAlex Zinenko Value ivReplacement = 272a54f4eaeSMogball builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id); 2734ead2cf7SAlex Zinenko en.value().replaceAllUsesWith(ivReplacement); 2744ead2cf7SAlex Zinenko std::advance(lbArgumentIt, 1); 2754ead2cf7SAlex Zinenko std::advance(stepArgumentIt, 1); 2764ead2cf7SAlex Zinenko } 2774ead2cf7SAlex Zinenko 2784ead2cf7SAlex Zinenko // We are done and can erase the original outermost loop. 2794ead2cf7SAlex Zinenko rootForOp.erase(); 2804ead2cf7SAlex Zinenko } 2814ead2cf7SAlex Zinenko 2824ead2cf7SAlex Zinenko // Generic loop to GPU kernel conversion function. 2832bcd1927SMaheshRavishankar static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp, 2844ead2cf7SAlex Zinenko unsigned numBlockDims, 2854ead2cf7SAlex Zinenko unsigned numThreadDims) { 2862bcd1927SMaheshRavishankar if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims))) 2874ead2cf7SAlex Zinenko return failure(); 2884ead2cf7SAlex Zinenko 2892bcd1927SMaheshRavishankar AffineLoopToGpuConverter converter; 2904ead2cf7SAlex Zinenko auto maybeInnerLoop = 2914ead2cf7SAlex Zinenko converter.collectBounds(forOp, numBlockDims + numThreadDims); 2924ead2cf7SAlex Zinenko if (!maybeInnerLoop) 2934ead2cf7SAlex Zinenko return failure(); 2944ead2cf7SAlex Zinenko converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims); 2954ead2cf7SAlex Zinenko 2964ead2cf7SAlex Zinenko return success(); 2974ead2cf7SAlex Zinenko } 2984ead2cf7SAlex Zinenko 2994ead2cf7SAlex Zinenko LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp, 3004ead2cf7SAlex Zinenko 
unsigned numBlockDims, 3014ead2cf7SAlex Zinenko unsigned numThreadDims) { 3022bcd1927SMaheshRavishankar return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims); 3034ead2cf7SAlex Zinenko } 3044ead2cf7SAlex Zinenko 3054ead2cf7SAlex Zinenko namespace { 3064ead2cf7SAlex Zinenko struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> { 3074ead2cf7SAlex Zinenko using OpRewritePattern<ParallelOp>::OpRewritePattern; 3084ead2cf7SAlex Zinenko 3094ead2cf7SAlex Zinenko LogicalResult matchAndRewrite(ParallelOp parallelOp, 3104ead2cf7SAlex Zinenko PatternRewriter &rewriter) const override; 3114ead2cf7SAlex Zinenko }; 3124ead2cf7SAlex Zinenko } // namespace 3134ead2cf7SAlex Zinenko 3144ead2cf7SAlex Zinenko /// Tries to derive a static upper bound from the defining operation of 3154ead2cf7SAlex Zinenko /// `upperBound`. 3164ead2cf7SAlex Zinenko static Value deriveStaticUpperBound(Value upperBound, 3174ead2cf7SAlex Zinenko PatternRewriter &rewriter) { 318a54f4eaeSMogball if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) { 3194ead2cf7SAlex Zinenko return op; 3204ead2cf7SAlex Zinenko } 3214ead2cf7SAlex Zinenko 3224ead2cf7SAlex Zinenko if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) { 3234ead2cf7SAlex Zinenko for (const AffineExpr &result : minOp.map().getResults()) { 3244ead2cf7SAlex Zinenko if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) { 325a54f4eaeSMogball return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(), 3264ead2cf7SAlex Zinenko constExpr.getValue()); 3274ead2cf7SAlex Zinenko } 3284ead2cf7SAlex Zinenko } 3294ead2cf7SAlex Zinenko } 3304ead2cf7SAlex Zinenko 331a54f4eaeSMogball if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) { 332a54f4eaeSMogball if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>( 3334ead2cf7SAlex Zinenko deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter) 3344ead2cf7SAlex Zinenko .getDefiningOp())) 335a54f4eaeSMogball if (auto rhs = 
dyn_cast_or_null<arith::ConstantIndexOp>( 3364ead2cf7SAlex Zinenko deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter) 3374ead2cf7SAlex Zinenko .getDefiningOp())) { 3384ead2cf7SAlex Zinenko // Assumptions about the upper bound of minimum computations no longer 3394ead2cf7SAlex Zinenko // work if multiplied by a negative value, so abort in this case. 340a54f4eaeSMogball if (lhs.value() < 0 || rhs.value() < 0) 3414ead2cf7SAlex Zinenko return {}; 3424ead2cf7SAlex Zinenko 343a54f4eaeSMogball return rewriter.create<arith::ConstantIndexOp>( 344a54f4eaeSMogball multiplyOp.getLoc(), lhs.value() * rhs.value()); 3454ead2cf7SAlex Zinenko } 3464ead2cf7SAlex Zinenko } 3474ead2cf7SAlex Zinenko 3484ead2cf7SAlex Zinenko return {}; 3494ead2cf7SAlex Zinenko } 3504ead2cf7SAlex Zinenko 3514ead2cf7SAlex Zinenko static bool isMappedToProcessor(gpu::Processor processor) { 3524ead2cf7SAlex Zinenko return processor != gpu::Processor::Sequential; 3534ead2cf7SAlex Zinenko } 3544ead2cf7SAlex Zinenko 3554ead2cf7SAlex Zinenko static unsigned getLaunchOpArgumentNum(gpu::Processor processor) { 3564ead2cf7SAlex Zinenko switch (processor) { 3574ead2cf7SAlex Zinenko case gpu::Processor::BlockX: 3584ead2cf7SAlex Zinenko return 0; 3594ead2cf7SAlex Zinenko case gpu::Processor::BlockY: 3604ead2cf7SAlex Zinenko return 1; 3614ead2cf7SAlex Zinenko case gpu::Processor::BlockZ: 3624ead2cf7SAlex Zinenko return 2; 3634ead2cf7SAlex Zinenko case gpu::Processor::ThreadX: 3644ead2cf7SAlex Zinenko return 3; 3654ead2cf7SAlex Zinenko case gpu::Processor::ThreadY: 3664ead2cf7SAlex Zinenko return 4; 3674ead2cf7SAlex Zinenko case gpu::Processor::ThreadZ: 3684ead2cf7SAlex Zinenko return 5; 3694ead2cf7SAlex Zinenko default:; 3704ead2cf7SAlex Zinenko } 3714ead2cf7SAlex Zinenko llvm_unreachable( 3724ead2cf7SAlex Zinenko "invalid processor type while retrieving launch op argument number"); 3734ead2cf7SAlex Zinenko } 3744ead2cf7SAlex Zinenko 3754ead2cf7SAlex Zinenko /// Modifies the current transformation state to 
capture the effect of the given 3764ead2cf7SAlex Zinenko /// `scf.parallel` operation on index substitutions and the operations to be 3774ead2cf7SAlex Zinenko /// inserted. 3784ead2cf7SAlex Zinenko /// Specifically, if a dimension of a parallel loop is mapped to a hardware id, 3794ead2cf7SAlex Zinenko /// this function will 3804ead2cf7SAlex Zinenko /// - compute the loop index based on the hardware id and affine map from the 3814ead2cf7SAlex Zinenko /// mapping and update `cloningMap` to substitute all uses. 3824ead2cf7SAlex Zinenko /// - derive a new upper bound for the hardware id and augment the provided 3834ead2cf7SAlex Zinenko /// `gpu.launch operation` accordingly. 3844ead2cf7SAlex Zinenko /// - if the upper bound is imprecise, insert a conditional in the `gpu.launch` 3854ead2cf7SAlex Zinenko /// and update the rewriter to insert into the conditional's body. 3864ead2cf7SAlex Zinenko /// If the dimension is mapped to sequential, 3874ead2cf7SAlex Zinenko /// - insert a for loop into the body and update the rewriter to insert into 3884ead2cf7SAlex Zinenko /// the for loop's body. 3894ead2cf7SAlex Zinenko /// - update the `cloningMap` to replace uses of the index with the index of 3904ead2cf7SAlex Zinenko /// the new for loop. 3914ead2cf7SAlex Zinenko /// In either case, 3924ead2cf7SAlex Zinenko /// - append the instructions from the loops body to worklist, in reverse order. 3934ead2cf7SAlex Zinenko /// To note the end of the current scope in case a loop or conditional was 3944ead2cf7SAlex Zinenko /// inserted, a sentinel (the `gpu.launch` operation) is inserted into the 3954ead2cf7SAlex Zinenko /// worklist. This signals the processor of the worklist to pop the rewriter 3964ead2cf7SAlex Zinenko /// one scope-level up. 
3974ead2cf7SAlex Zinenko static LogicalResult processParallelLoop( 3984ead2cf7SAlex Zinenko ParallelOp parallelOp, gpu::LaunchOp launchOp, 3994ead2cf7SAlex Zinenko BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist, 4004ead2cf7SAlex Zinenko DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) { 4019db53a18SRiver Riddle // TODO: Verify that this is a valid GPU mapping. 4024ead2cf7SAlex Zinenko // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential 4034ead2cf7SAlex Zinenko ArrayAttr mapping = 4040bf4a82aSChristian Sigg parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName()); 4054ead2cf7SAlex Zinenko 4069db53a18SRiver Riddle // TODO: Support reductions. 4074ead2cf7SAlex Zinenko if (!mapping || parallelOp.getNumResults() != 0) 4084ead2cf7SAlex Zinenko return failure(); 4094ead2cf7SAlex Zinenko 4104ead2cf7SAlex Zinenko Location loc = parallelOp.getLoc(); 4114ead2cf7SAlex Zinenko 4124ead2cf7SAlex Zinenko auto launchIndependent = [&launchOp](Value val) { 4130bf4a82aSChristian Sigg return val.getParentRegion()->isAncestor(launchOp->getParentRegion()); 4144ead2cf7SAlex Zinenko }; 4154ead2cf7SAlex Zinenko 4164ead2cf7SAlex Zinenko auto ensureLaunchIndependent = [&rewriter, 4174ead2cf7SAlex Zinenko launchIndependent](Value val) -> Value { 4184ead2cf7SAlex Zinenko if (launchIndependent(val)) 4194ead2cf7SAlex Zinenko return val; 420a54f4eaeSMogball if (auto constOp = val.getDefiningOp<arith::ConstantOp>()) 421a54f4eaeSMogball return rewriter.create<arith::ConstantOp>(constOp.getLoc(), 422cfb72fd3SJacques Pienaar constOp.getValue()); 4234ead2cf7SAlex Zinenko return {}; 4244ead2cf7SAlex Zinenko }; 4254ead2cf7SAlex Zinenko 426c0342a2dSJacques Pienaar for (auto config : llvm::zip( 427c0342a2dSJacques Pienaar mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(), 428c0342a2dSJacques Pienaar parallelOp.getUpperBound(), parallelOp.getStep())) { 4294ead2cf7SAlex Zinenko Attribute mappingAttribute; 
4304ead2cf7SAlex Zinenko Value iv, lowerBound, upperBound, step; 4314ead2cf7SAlex Zinenko std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config; 4327bdd3722SMogball auto annotation = 4337bdd3722SMogball mappingAttribute.dyn_cast<gpu::ParallelLoopDimMappingAttr>(); 4344ead2cf7SAlex Zinenko if (!annotation) 4354ead2cf7SAlex Zinenko return parallelOp.emitOpError() 4364ead2cf7SAlex Zinenko << "expected mapping attribute for lowering to GPU"; 4374ead2cf7SAlex Zinenko Value newIndex; 4387bdd3722SMogball gpu::Processor processor = annotation.getProcessor(); 4394ead2cf7SAlex Zinenko 4404ead2cf7SAlex Zinenko if (isMappedToProcessor(processor)) { 4414ead2cf7SAlex Zinenko // Use the corresponding thread/grid index as replacement for the loop iv. 442e2b71610SRahul Joshi Value operand = 443e2b71610SRahul Joshi launchOp.body().getArgument(getLaunchOpArgumentNum(processor)); 4444ead2cf7SAlex Zinenko // Take the indexmap and add the lower bound and step computations in. 4454ead2cf7SAlex Zinenko // This computes operand * step + lowerBound. 4464ead2cf7SAlex Zinenko // Use an affine map here so that it composes nicely with the provided 4474ead2cf7SAlex Zinenko // annotation. 4484ead2cf7SAlex Zinenko AffineMap lowerAndStep = AffineMap::get( 4494ead2cf7SAlex Zinenko 1, 2, 4504ead2cf7SAlex Zinenko rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) + 4514ead2cf7SAlex Zinenko rewriter.getAffineSymbolExpr(1)); 4524ead2cf7SAlex Zinenko newIndex = rewriter.create<AffineApplyOp>( 4537bdd3722SMogball loc, annotation.getMap().compose(lowerAndStep), 4544ead2cf7SAlex Zinenko ValueRange{operand, step, lowerBound}); 4554ead2cf7SAlex Zinenko // If there was also a bound, insert that, too. 4569db53a18SRiver Riddle // TODO: Check that we do not assign bounds twice. 
4577bdd3722SMogball if (annotation.getBound()) { 4584ead2cf7SAlex Zinenko // We pass as the single operand to the bound-map the number of 4594ead2cf7SAlex Zinenko // iterations, which is (upperBound - lowerBound) ceilDiv step. To 4604ead2cf7SAlex Zinenko // support inner loops with dynamic upper bounds (as generated by e.g. 4614ead2cf7SAlex Zinenko // tiling), try to derive a max for the bounds. If the used bound for 4624ead2cf7SAlex Zinenko // the hardware id is imprecise, wrap the contained code into a 4634ead2cf7SAlex Zinenko // conditional. If the lower-bound is constant or defined before the 4644ead2cf7SAlex Zinenko // launch, we can use it in the launch bounds. Otherwise fail. 4654ead2cf7SAlex Zinenko if (!launchIndependent(lowerBound) && 466a54f4eaeSMogball !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp())) 4674ead2cf7SAlex Zinenko return failure(); 4684ead2cf7SAlex Zinenko // The step must also be constant or defined outside of the loop nest. 4694ead2cf7SAlex Zinenko if (!launchIndependent(step) && 470a54f4eaeSMogball !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp())) 4714ead2cf7SAlex Zinenko return failure(); 4724ead2cf7SAlex Zinenko // If the upper-bound is constant or defined before the launch, we can 4734ead2cf7SAlex Zinenko // use it in the launch bounds directly. Otherwise try derive a bound. 
4744ead2cf7SAlex Zinenko bool boundIsPrecise = 4754ead2cf7SAlex Zinenko launchIndependent(upperBound) || 476a54f4eaeSMogball isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp()); 4774ead2cf7SAlex Zinenko { 4784ead2cf7SAlex Zinenko PatternRewriter::InsertionGuard guard(rewriter); 4794ead2cf7SAlex Zinenko rewriter.setInsertionPoint(launchOp); 4804ead2cf7SAlex Zinenko if (!boundIsPrecise) { 4814ead2cf7SAlex Zinenko upperBound = deriveStaticUpperBound(upperBound, rewriter); 4824ead2cf7SAlex Zinenko if (!upperBound) { 4835da2423bSStephan Herhut return rewriter.notifyMatchFailure( 4845da2423bSStephan Herhut parallelOp, 4855da2423bSStephan Herhut "cannot derive loop-invariant upper bound for number of" 4865da2423bSStephan Herhut "iterations"); 4874ead2cf7SAlex Zinenko } 4884ead2cf7SAlex Zinenko } 4894ead2cf7SAlex Zinenko // Compute the number of iterations needed. We compute this as an 4904ead2cf7SAlex Zinenko // affine expression ceilDiv (upperBound - lowerBound) step. We use 4914ead2cf7SAlex Zinenko // affine.apply here so that it composes nicely with the provided map. 49272d5ac90STres Popp AffineMap stepMap = AffineMap::get( 49372d5ac90STres Popp 1, 2, 49472d5ac90STres Popp ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0)) 49572d5ac90STres Popp .ceilDiv(rewriter.getAffineSymbolExpr(1)))); 4964ead2cf7SAlex Zinenko Value launchBound = rewriter.create<AffineApplyOp>( 4977bdd3722SMogball loc, annotation.getBound().compose(stepMap), 4984ead2cf7SAlex Zinenko ValueRange{ 4994ead2cf7SAlex Zinenko ensureLaunchIndependent( 5004ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(upperBound)), 5014ead2cf7SAlex Zinenko ensureLaunchIndependent( 5024ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(lowerBound)), 5034ead2cf7SAlex Zinenko ensureLaunchIndependent(cloningMap.lookupOrDefault(step))}); 5044ead2cf7SAlex Zinenko // todo(herhut,ravishankarm): Update the behavior of setMappingAttr 5054ead2cf7SAlex Zinenko // when this condition is relaxed. 
5064ead2cf7SAlex Zinenko if (bounds.find(processor) != bounds.end()) { 5075da2423bSStephan Herhut return rewriter.notifyMatchFailure( 5085da2423bSStephan Herhut parallelOp, "cannot redefine the bound for processor " + 5095da2423bSStephan Herhut Twine(static_cast<int64_t>(processor))); 5104ead2cf7SAlex Zinenko } 5114ead2cf7SAlex Zinenko bounds[processor] = launchBound; 5124ead2cf7SAlex Zinenko } 5134ead2cf7SAlex Zinenko if (!boundIsPrecise) { 5144ead2cf7SAlex Zinenko // We are using an approximation, create a surrounding conditional. 5154ead2cf7SAlex Zinenko Value originalBound = std::get<3>(config); 516a54f4eaeSMogball arith::CmpIOp pred = rewriter.create<arith::CmpIOp>( 517a54f4eaeSMogball loc, arith::CmpIPredicate::slt, newIndex, 5184ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(originalBound)); 5194ead2cf7SAlex Zinenko scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false); 520c0342a2dSJacques Pienaar rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front()); 5214ead2cf7SAlex Zinenko // Put a sentinel into the worklist so we know when to pop out of the 5224ead2cf7SAlex Zinenko // if body again. We use the launchOp here, as that cannot be part of 5234ead2cf7SAlex Zinenko // the bodies instruction. 5244ead2cf7SAlex Zinenko worklist.push_back(launchOp.getOperation()); 5254ead2cf7SAlex Zinenko } 5264ead2cf7SAlex Zinenko } 5274ead2cf7SAlex Zinenko } else { 5284ead2cf7SAlex Zinenko // Create a sequential for loop. 5294ead2cf7SAlex Zinenko auto loopOp = rewriter.create<scf::ForOp>( 5304ead2cf7SAlex Zinenko loc, cloningMap.lookupOrDefault(lowerBound), 5314ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(upperBound), 5324ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(step)); 5334ead2cf7SAlex Zinenko newIndex = loopOp.getInductionVar(); 5344ead2cf7SAlex Zinenko rewriter.setInsertionPointToStart(loopOp.getBody()); 5354ead2cf7SAlex Zinenko // Put a sentinel into the worklist so we know when to pop out of the loop 5364ead2cf7SAlex Zinenko // body again. 
We use the launchOp here, as that cannot be part of the 5374ead2cf7SAlex Zinenko // bodies instruction. 5384ead2cf7SAlex Zinenko worklist.push_back(launchOp.getOperation()); 5394ead2cf7SAlex Zinenko } 5404ead2cf7SAlex Zinenko cloningMap.map(iv, newIndex); 5414ead2cf7SAlex Zinenko } 542396e7f45SArtur Bialas 543396e7f45SArtur Bialas // Propagate custom user defined optional attributes, that can be used at 544396e7f45SArtur Bialas // later stage, such as extension data for GPU kernel dispatch 54556774bddSMarius Brehler for (const auto &namedAttr : parallelOp->getAttrs()) { 5460c7890c8SRiver Riddle if (namedAttr.getName() == gpu::getMappingAttrName() || 5470c7890c8SRiver Riddle namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr()) 548396e7f45SArtur Bialas continue; 5490c7890c8SRiver Riddle launchOp->setAttr(namedAttr.getName(), namedAttr.getValue()); 550396e7f45SArtur Bialas } 551396e7f45SArtur Bialas 5524ead2cf7SAlex Zinenko Block *body = parallelOp.getBody(); 5534ead2cf7SAlex Zinenko worklist.reserve(worklist.size() + body->getOperations().size()); 5544ead2cf7SAlex Zinenko for (Operation &op : llvm::reverse(body->without_terminator())) 5554ead2cf7SAlex Zinenko worklist.push_back(&op); 5564ead2cf7SAlex Zinenko return success(); 5574ead2cf7SAlex Zinenko } 5584ead2cf7SAlex Zinenko 5594ead2cf7SAlex Zinenko /// Lower a `scf.parallel` operation into a corresponding `gpu.launch` 5604ead2cf7SAlex Zinenko /// operation. 5614ead2cf7SAlex Zinenko /// 5624ead2cf7SAlex Zinenko /// This essentially transforms a loop nest into a corresponding SIMT function. 5634ead2cf7SAlex Zinenko /// The conversion is driven by mapping annotations on the `scf.parallel` 5644ead2cf7SAlex Zinenko /// operations. The mapping is provided via a `DictionaryAttribute` named 5654ead2cf7SAlex Zinenko /// `mapping`, which has three entries: 5664ead2cf7SAlex Zinenko /// - processor: the hardware id to map to. 
///                0-2 are block dimensions, 3-5 are thread dimensions and 6 is
///                sequential.
///   - map : An affine map that is used to pre-process hardware ids before
///           substitution.
///   - bound : An affine map that is used to compute the bound of the hardware
///             id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated, as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first `map` is used to compute the actual index for
/// substitution from the hardware id. The second `bound` is used to compute the
/// launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such a
/// case, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound.
This only works if an static upper bound for the 5864ead2cf7SAlex Zinenko /// dynamic loop bound can be derived, currently via analyzing `affine.min` 5874ead2cf7SAlex Zinenko /// operations. 5884ead2cf7SAlex Zinenko LogicalResult 5894ead2cf7SAlex Zinenko ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, 5904ead2cf7SAlex Zinenko PatternRewriter &rewriter) const { 591ec03bbe8SVladislav Vinogradov // Mark the operation as visited for recursive legality check. 592ec03bbe8SVladislav Vinogradov parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr()); 593ec03bbe8SVladislav Vinogradov 5945da2423bSStephan Herhut // We can only transform starting at the outer-most loop. Launches inside of 5955da2423bSStephan Herhut // parallel loops are not supported. 5960bf4a82aSChristian Sigg if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>()) 5975da2423bSStephan Herhut return failure(); 5984ead2cf7SAlex Zinenko // Create a launch operation. We start with bound one for all grid/block 5994ead2cf7SAlex Zinenko // sizes. Those will be refined later as we discover them from mappings. 
6004ead2cf7SAlex Zinenko Location loc = parallelOp.getLoc(); 601a54f4eaeSMogball Value constantOne = 602a54f4eaeSMogball rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1); 6034ead2cf7SAlex Zinenko gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>( 6044ead2cf7SAlex Zinenko parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne, 6054ead2cf7SAlex Zinenko constantOne, constantOne); 6064ead2cf7SAlex Zinenko rewriter.setInsertionPointToEnd(&launchOp.body().front()); 6074ead2cf7SAlex Zinenko rewriter.create<gpu::TerminatorOp>(loc); 6084ead2cf7SAlex Zinenko rewriter.setInsertionPointToStart(&launchOp.body().front()); 6094ead2cf7SAlex Zinenko 6104ead2cf7SAlex Zinenko BlockAndValueMapping cloningMap; 6114ead2cf7SAlex Zinenko llvm::DenseMap<gpu::Processor, Value> launchBounds; 6124ead2cf7SAlex Zinenko SmallVector<Operation *, 16> worklist; 6134ead2cf7SAlex Zinenko if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist, 6144ead2cf7SAlex Zinenko launchBounds, rewriter))) 6154ead2cf7SAlex Zinenko return failure(); 6164ead2cf7SAlex Zinenko 6174ead2cf7SAlex Zinenko // Whether we have seen any side-effects. Reset when leaving an inner scope. 6184ead2cf7SAlex Zinenko bool seenSideeffects = false; 6194ead2cf7SAlex Zinenko // Whether we have left a nesting scope (and hence are no longer innermost). 6204ead2cf7SAlex Zinenko bool leftNestingScope = false; 6214ead2cf7SAlex Zinenko while (!worklist.empty()) { 6224ead2cf7SAlex Zinenko Operation *op = worklist.pop_back_val(); 6234ead2cf7SAlex Zinenko // Now walk over the body and clone it. 6244ead2cf7SAlex Zinenko // TODO: This is only correct if there either is no further scf.parallel 6254ead2cf7SAlex Zinenko // nested or this code is side-effect free. Otherwise we might need 6264ead2cf7SAlex Zinenko // predication. We are overly conservative for now and only allow 6274ead2cf7SAlex Zinenko // side-effects in the innermost scope. 
6284ead2cf7SAlex Zinenko if (auto nestedParallel = dyn_cast<ParallelOp>(op)) { 6294ead2cf7SAlex Zinenko // Before entering a nested scope, make sure there have been no 6304ead2cf7SAlex Zinenko // sideeffects until now. 6314ead2cf7SAlex Zinenko if (seenSideeffects) 6324ead2cf7SAlex Zinenko return failure(); 6334ead2cf7SAlex Zinenko // A nested scf.parallel needs insertion of code to compute indices. 6344ead2cf7SAlex Zinenko // Insert that now. This will also update the worklist with the loops 6354ead2cf7SAlex Zinenko // body. 6364ead2cf7SAlex Zinenko if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap, 6374ead2cf7SAlex Zinenko worklist, launchBounds, rewriter))) 6384ead2cf7SAlex Zinenko return failure(); 6394ead2cf7SAlex Zinenko } else if (op == launchOp.getOperation()) { 6404ead2cf7SAlex Zinenko // Found our sentinel value. We have finished the operations from one 6414ead2cf7SAlex Zinenko // nesting level, pop one level back up. 64202b6fb21SMehdi Amini auto *parent = rewriter.getInsertionPoint()->getParentOp(); 6434ead2cf7SAlex Zinenko rewriter.setInsertionPointAfter(parent); 6444ead2cf7SAlex Zinenko leftNestingScope = true; 6454ead2cf7SAlex Zinenko seenSideeffects = false; 6464ead2cf7SAlex Zinenko } else { 6474ead2cf7SAlex Zinenko // Otherwise we copy it over. 6484ead2cf7SAlex Zinenko Operation *clone = rewriter.clone(*op, cloningMap); 6494ead2cf7SAlex Zinenko cloningMap.map(op->getResults(), clone->getResults()); 6504ead2cf7SAlex Zinenko // Check for side effects. 6514ead2cf7SAlex Zinenko // TODO: Handle region side effects properly. 6524ead2cf7SAlex Zinenko seenSideeffects |= !MemoryEffectOpInterface::hasNoEffect(clone) || 6534ead2cf7SAlex Zinenko clone->getNumRegions() != 0; 6544ead2cf7SAlex Zinenko // If we are no longer in the innermost scope, sideeffects are disallowed. 
6554ead2cf7SAlex Zinenko if (seenSideeffects && leftNestingScope) 6564ead2cf7SAlex Zinenko return failure(); 6574ead2cf7SAlex Zinenko } 6584ead2cf7SAlex Zinenko } 6594ead2cf7SAlex Zinenko 6604ead2cf7SAlex Zinenko // Now that we succeeded creating the launch operation, also update the 6614ead2cf7SAlex Zinenko // bounds. 6624ead2cf7SAlex Zinenko for (auto bound : launchBounds) 6634ead2cf7SAlex Zinenko launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)), 6644ead2cf7SAlex Zinenko std::get<1>(bound)); 6654ead2cf7SAlex Zinenko 6664ead2cf7SAlex Zinenko rewriter.eraseOp(parallelOp); 6674ead2cf7SAlex Zinenko return success(); 6684ead2cf7SAlex Zinenko } 6694ead2cf7SAlex Zinenko 670dc4e913bSChris Lattner void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) { 671dc4e913bSChris Lattner patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext()); 6724ead2cf7SAlex Zinenko } 6735da2423bSStephan Herhut 6745da2423bSStephan Herhut void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) { 675e2310704SJulian Gross target.addLegalDialect<memref::MemRefDialect>(); 6765da2423bSStephan Herhut target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) { 677ec03bbe8SVladislav Vinogradov return !parallelOp->hasAttr(gpu::getMappingAttrName()) || 678ec03bbe8SVladislav Vinogradov parallelOp->hasAttr(kVisitedAttrName); 679ec03bbe8SVladislav Vinogradov }); 680ec03bbe8SVladislav Vinogradov } 681ec03bbe8SVladislav Vinogradov 682ec03bbe8SVladislav Vinogradov void mlir::finalizeParallelLoopToGPUConversion(Operation *op) { 683ec03bbe8SVladislav Vinogradov op->walk([](scf::ParallelOp parallelOp) { 684ec03bbe8SVladislav Vinogradov parallelOp->removeAttr(kVisitedAttrName); 6855da2423bSStephan Herhut }); 6865da2423bSStephan Herhut } 687