14ead2cf7SAlex Zinenko //===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===// 24ead2cf7SAlex Zinenko // 34ead2cf7SAlex Zinenko // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 44ead2cf7SAlex Zinenko // See https://llvm.org/LICENSE.txt for license information. 54ead2cf7SAlex Zinenko // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 64ead2cf7SAlex Zinenko // 74ead2cf7SAlex Zinenko //===----------------------------------------------------------------------===// 84ead2cf7SAlex Zinenko // 94ead2cf7SAlex Zinenko // This implements a straightforward conversion of an loop nest into a GPU 104ead2cf7SAlex Zinenko // kernel. The caller is expected to guarantee that the conversion is correct 114ead2cf7SAlex Zinenko // or to further transform the kernel to ensure correctness. 124ead2cf7SAlex Zinenko // 134ead2cf7SAlex Zinenko //===----------------------------------------------------------------------===// 144ead2cf7SAlex Zinenko 154ead2cf7SAlex Zinenko #include "mlir/Conversion/SCFToGPU/SCFToGPU.h" 164ead2cf7SAlex Zinenko 174ead2cf7SAlex Zinenko #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" 184ead2cf7SAlex Zinenko #include "mlir/Dialect/Affine/IR/AffineOps.h" 19*a54f4eaeSMogball #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" 204ead2cf7SAlex Zinenko #include "mlir/Dialect/GPU/GPUDialect.h" 214ead2cf7SAlex Zinenko #include "mlir/Dialect/GPU/ParallelLoopMapper.h" 22e2310704SJulian Gross #include "mlir/Dialect/MemRef/IR/MemRef.h" 234ead2cf7SAlex Zinenko #include "mlir/Dialect/SCF/SCF.h" 244ead2cf7SAlex Zinenko #include "mlir/Dialect/StandardOps/IR/Ops.h" 254ead2cf7SAlex Zinenko #include "mlir/IR/AffineExpr.h" 264ead2cf7SAlex Zinenko #include "mlir/IR/BlockAndValueMapping.h" 274ead2cf7SAlex Zinenko #include "mlir/IR/Builders.h" 284ead2cf7SAlex Zinenko #include "mlir/Pass/Pass.h" 294ead2cf7SAlex Zinenko #include "mlir/Transforms/DialectConversion.h" 304ead2cf7SAlex Zinenko #include 
"mlir/Transforms/LoopUtils.h" 314ead2cf7SAlex Zinenko #include "mlir/Transforms/Passes.h" 324ead2cf7SAlex Zinenko #include "mlir/Transforms/RegionUtils.h" 334ead2cf7SAlex Zinenko #include "llvm/ADT/Sequence.h" 344ead2cf7SAlex Zinenko #include "llvm/Support/Debug.h" 354ead2cf7SAlex Zinenko 364ead2cf7SAlex Zinenko #define DEBUG_TYPE "loops-to-gpu" 374ead2cf7SAlex Zinenko 384ead2cf7SAlex Zinenko using namespace mlir; 394ead2cf7SAlex Zinenko using namespace mlir::scf; 404ead2cf7SAlex Zinenko 41ec03bbe8SVladislav Vinogradov // Name of internal attribute to mark visited operations during conversion. 42ec03bbe8SVladislav Vinogradov // 43ec03bbe8SVladislav Vinogradov // NOTE: The conversion originally used the following legality criteria: 44ec03bbe8SVladislav Vinogradov // `!parallelOp->hasAttr(gpu::getMappingAttrName())` 45ec03bbe8SVladislav Vinogradov // But the provided pattern might reject some cases based on more detailed 46ec03bbe8SVladislav Vinogradov // analysis of the `mapping` attribute. 47ec03bbe8SVladislav Vinogradov // To avoid dialect conversion failure due to non-converted illegal operation 48ec03bbe8SVladislav Vinogradov // we use this extra Unit attribute as a marker, that the operation was checked 49ec03bbe8SVladislav Vinogradov // by the pattern and is should be considered as legal in the following legality 50ec03bbe8SVladislav Vinogradov // checks. The `finalizeParallelLoopToGPUConversion` function performs clean up 51ec03bbe8SVladislav Vinogradov // of this extra attributes ans is supposed to be called after the dialect 52ec03bbe8SVladislav Vinogradov // conversion. 53ec03bbe8SVladislav Vinogradov // 54ec03bbe8SVladislav Vinogradov // TODO: Implement a cleaner solution, factoring out the "matching" logic 55ec03bbe8SVladislav Vinogradov // from the pattern and its callees into a separate function that can be called 56ec03bbe8SVladislav Vinogradov // from both the pattern and the op legality check. 
57ec03bbe8SVladislav Vinogradov static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited"; 58ec03bbe8SVladislav Vinogradov 594ead2cf7SAlex Zinenko // Extract an indexed value from KernelDim3. 604ead2cf7SAlex Zinenko static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) { 614ead2cf7SAlex Zinenko switch (pos) { 624ead2cf7SAlex Zinenko case 0: 634ead2cf7SAlex Zinenko return dim3.x; 644ead2cf7SAlex Zinenko case 1: 654ead2cf7SAlex Zinenko return dim3.y; 664ead2cf7SAlex Zinenko case 2: 674ead2cf7SAlex Zinenko return dim3.z; 684ead2cf7SAlex Zinenko default: 694ead2cf7SAlex Zinenko llvm_unreachable("dim3 position out of bounds"); 704ead2cf7SAlex Zinenko } 714ead2cf7SAlex Zinenko return nullptr; 724ead2cf7SAlex Zinenko } 734ead2cf7SAlex Zinenko 744ead2cf7SAlex Zinenko // Get the lower bound-related operands of a loop operation. 754ead2cf7SAlex Zinenko static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) { 764ead2cf7SAlex Zinenko return forOp.getLowerBoundOperands(); 774ead2cf7SAlex Zinenko } 784ead2cf7SAlex Zinenko 794ead2cf7SAlex Zinenko // Get the upper bound-related operands of a loop operation. 804ead2cf7SAlex Zinenko static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) { 814ead2cf7SAlex Zinenko return forOp.getUpperBoundOperands(); 824ead2cf7SAlex Zinenko } 834ead2cf7SAlex Zinenko 844ead2cf7SAlex Zinenko // Get a Value that corresponds to the loop step. If the step is an attribute, 854ead2cf7SAlex Zinenko // materialize a corresponding constant using builder. 864ead2cf7SAlex Zinenko static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) { 87*a54f4eaeSMogball return builder.create<arith::ConstantIndexOp>(forOp.getLoc(), 88*a54f4eaeSMogball forOp.getStep()); 894ead2cf7SAlex Zinenko } 904ead2cf7SAlex Zinenko 914ead2cf7SAlex Zinenko // Get a Value for the loop lower bound. If the value requires computation, 924ead2cf7SAlex Zinenko // materialize the instructions using builder. 
934ead2cf7SAlex Zinenko static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) { 944ead2cf7SAlex Zinenko return lowerAffineLowerBound(forOp, builder); 954ead2cf7SAlex Zinenko } 964ead2cf7SAlex Zinenko 974ead2cf7SAlex Zinenko // Get a Value for the loop upper bound. If the value requires computation, 984ead2cf7SAlex Zinenko // materialize the instructions using builder. 994ead2cf7SAlex Zinenko static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) { 1004ead2cf7SAlex Zinenko return lowerAffineUpperBound(forOp, builder); 1014ead2cf7SAlex Zinenko } 1024ead2cf7SAlex Zinenko 1034ead2cf7SAlex Zinenko // Check the structure of the loop nest: 1044ead2cf7SAlex Zinenko // - there are enough loops to map to numDims; 1054ead2cf7SAlex Zinenko // - the loops are perfectly nested; 1064ead2cf7SAlex Zinenko // - the loop bounds can be computed above the outermost loop. 1074ead2cf7SAlex Zinenko // This roughly corresponds to the "matcher" part of the pattern-based 1084ead2cf7SAlex Zinenko // rewriting infrastructure. 1092bcd1927SMaheshRavishankar static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp, 1102bcd1927SMaheshRavishankar unsigned numDims) { 1114ead2cf7SAlex Zinenko Region &limit = forOp.region(); 1124ead2cf7SAlex Zinenko for (unsigned i = 0, e = numDims; i < e; ++i) { 1134ead2cf7SAlex Zinenko Operation *nested = &forOp.getBody()->front(); 1144ead2cf7SAlex Zinenko if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) || 1154ead2cf7SAlex Zinenko !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit)) 1164ead2cf7SAlex Zinenko return forOp.emitError( 1174ead2cf7SAlex Zinenko "loops with bounds depending on other mapped loops " 1184ead2cf7SAlex Zinenko "are not supported"); 1194ead2cf7SAlex Zinenko 1204ead2cf7SAlex Zinenko // The innermost loop can have an arbitrary body, skip the perfect nesting 1214ead2cf7SAlex Zinenko // check for it. 
1224ead2cf7SAlex Zinenko if (i == e - 1) 1234ead2cf7SAlex Zinenko break; 1244ead2cf7SAlex Zinenko 1254ead2cf7SAlex Zinenko auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end(); 1264ead2cf7SAlex Zinenko if (forOp.getBody()->empty() || std::next(begin, 2) != end) 1274ead2cf7SAlex Zinenko return forOp.emitError("expected perfectly nested loops in the body"); 1284ead2cf7SAlex Zinenko 1292bcd1927SMaheshRavishankar if (!(forOp = dyn_cast<AffineForOp>(nested))) 1304ead2cf7SAlex Zinenko return nested->emitError("expected a nested loop"); 1314ead2cf7SAlex Zinenko } 1324ead2cf7SAlex Zinenko return success(); 1334ead2cf7SAlex Zinenko } 1344ead2cf7SAlex Zinenko 1352bcd1927SMaheshRavishankar static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp, 1362bcd1927SMaheshRavishankar unsigned numBlockDims, 1374ead2cf7SAlex Zinenko unsigned numThreadDims) { 1384ead2cf7SAlex Zinenko if (numBlockDims < 1 || numThreadDims < 1) { 1394ead2cf7SAlex Zinenko LLVM_DEBUG(llvm::dbgs() << "nothing to map"); 1404ead2cf7SAlex Zinenko return success(); 1414ead2cf7SAlex Zinenko } 1424ead2cf7SAlex Zinenko 1434ead2cf7SAlex Zinenko if (numBlockDims > 3) { 1444ead2cf7SAlex Zinenko return forOp.emitError("cannot map to more than 3 block dimensions"); 1454ead2cf7SAlex Zinenko } 1464ead2cf7SAlex Zinenko if (numThreadDims > 3) { 1474ead2cf7SAlex Zinenko return forOp.emitError("cannot map to more than 3 thread dimensions"); 1484ead2cf7SAlex Zinenko } 1492bcd1927SMaheshRavishankar return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims); 1504ead2cf7SAlex Zinenko } 1514ead2cf7SAlex Zinenko 1524ead2cf7SAlex Zinenko namespace { 1534ead2cf7SAlex Zinenko // Helper structure that holds common state of the loop to GPU kernel 1544ead2cf7SAlex Zinenko // conversion. 
1552bcd1927SMaheshRavishankar struct AffineLoopToGpuConverter { 1562bcd1927SMaheshRavishankar Optional<AffineForOp> collectBounds(AffineForOp forOp, unsigned numLoops); 1574ead2cf7SAlex Zinenko 1582bcd1927SMaheshRavishankar void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp, 1592bcd1927SMaheshRavishankar unsigned numBlockDims, unsigned numThreadDims); 1604ead2cf7SAlex Zinenko 1614ead2cf7SAlex Zinenko // Ranges of the loops mapped to blocks or threads. 1624ead2cf7SAlex Zinenko SmallVector<Value, 6> dims; 1634ead2cf7SAlex Zinenko // Lower bounds of the loops mapped to blocks or threads. 1644ead2cf7SAlex Zinenko SmallVector<Value, 6> lbs; 1654ead2cf7SAlex Zinenko // Induction variables of the loops mapped to blocks or threads. 1664ead2cf7SAlex Zinenko SmallVector<Value, 6> ivs; 1674ead2cf7SAlex Zinenko // Steps of the loops mapped to blocks or threads. 1684ead2cf7SAlex Zinenko SmallVector<Value, 6> steps; 1694ead2cf7SAlex Zinenko }; 1704ead2cf7SAlex Zinenko } // namespace 1714ead2cf7SAlex Zinenko 1724ead2cf7SAlex Zinenko // Return true if the value is obviously a constant "one". 1734ead2cf7SAlex Zinenko static bool isConstantOne(Value value) { 174*a54f4eaeSMogball if (auto def = value.getDefiningOp<arith::ConstantIndexOp>()) 175*a54f4eaeSMogball return def.value() == 1; 1764ead2cf7SAlex Zinenko return false; 1774ead2cf7SAlex Zinenko } 1784ead2cf7SAlex Zinenko 1794ead2cf7SAlex Zinenko // Collect ranges, bounds, steps and induction variables in preparation for 1804ead2cf7SAlex Zinenko // mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel. 1814ead2cf7SAlex Zinenko // This may fail if the IR for computing loop bounds cannot be constructed, for 1824ead2cf7SAlex Zinenko // example if an affine loop uses semi-affine maps. Return the last loop to be 1834ead2cf7SAlex Zinenko // mapped on success, llvm::None on failure. 
1842bcd1927SMaheshRavishankar Optional<AffineForOp> 1852bcd1927SMaheshRavishankar AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) { 1864ead2cf7SAlex Zinenko OpBuilder builder(forOp.getOperation()); 1874ead2cf7SAlex Zinenko dims.reserve(numLoops); 1884ead2cf7SAlex Zinenko lbs.reserve(numLoops); 1894ead2cf7SAlex Zinenko ivs.reserve(numLoops); 1904ead2cf7SAlex Zinenko steps.reserve(numLoops); 1912bcd1927SMaheshRavishankar AffineForOp currentLoop = forOp; 1924ead2cf7SAlex Zinenko for (unsigned i = 0; i < numLoops; ++i) { 1934ead2cf7SAlex Zinenko Value lowerBound = getOrEmitLowerBound(currentLoop, builder); 1944ead2cf7SAlex Zinenko Value upperBound = getOrEmitUpperBound(currentLoop, builder); 1954ead2cf7SAlex Zinenko if (!lowerBound || !upperBound) { 1964ead2cf7SAlex Zinenko return llvm::None; 1974ead2cf7SAlex Zinenko } 1984ead2cf7SAlex Zinenko 199*a54f4eaeSMogball Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(), 200*a54f4eaeSMogball upperBound, lowerBound); 2014ead2cf7SAlex Zinenko Value step = getOrCreateStep(currentLoop, builder); 2024ead2cf7SAlex Zinenko if (!isConstantOne(step)) 203*a54f4eaeSMogball range = builder.create<arith::DivSIOp>(currentLoop.getLoc(), range, step); 2044ead2cf7SAlex Zinenko dims.push_back(range); 2054ead2cf7SAlex Zinenko 2064ead2cf7SAlex Zinenko lbs.push_back(lowerBound); 2074ead2cf7SAlex Zinenko ivs.push_back(currentLoop.getInductionVar()); 2084ead2cf7SAlex Zinenko steps.push_back(step); 2094ead2cf7SAlex Zinenko 2104ead2cf7SAlex Zinenko if (i != numLoops - 1) 2112bcd1927SMaheshRavishankar currentLoop = cast<AffineForOp>(¤tLoop.getBody()->front()); 2124ead2cf7SAlex Zinenko } 2134ead2cf7SAlex Zinenko return currentLoop; 2144ead2cf7SAlex Zinenko } 2154ead2cf7SAlex Zinenko 2164ead2cf7SAlex Zinenko // Replace the rooted at "rootForOp" with a GPU launch operation. 
This expects 2174ead2cf7SAlex Zinenko // "innermostForOp" to point to the last loop to be transformed to the kernel, 2184ead2cf7SAlex Zinenko // and to have (numBlockDims + numThreadDims) perfectly nested loops between 2194ead2cf7SAlex Zinenko // "rootForOp" and "innermostForOp". 2202bcd1927SMaheshRavishankar void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp, 2212bcd1927SMaheshRavishankar AffineForOp innermostForOp, 2224ead2cf7SAlex Zinenko unsigned numBlockDims, 2234ead2cf7SAlex Zinenko unsigned numThreadDims) { 2244ead2cf7SAlex Zinenko OpBuilder builder(rootForOp.getOperation()); 2254ead2cf7SAlex Zinenko // Prepare the grid and block sizes for the launch operation. If there is 2264ead2cf7SAlex Zinenko // no loop mapped to a specific dimension, use constant "1" as its size. 227*a54f4eaeSMogball Value constOne = 228*a54f4eaeSMogball (numBlockDims < 3 || numThreadDims < 3) 229*a54f4eaeSMogball ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1) 2304ead2cf7SAlex Zinenko : nullptr; 2314ead2cf7SAlex Zinenko Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne; 2324ead2cf7SAlex Zinenko Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne; 2334ead2cf7SAlex Zinenko Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne; 2344ead2cf7SAlex Zinenko Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne; 2354ead2cf7SAlex Zinenko Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne; 2364ead2cf7SAlex Zinenko Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne; 2374ead2cf7SAlex Zinenko 2384ead2cf7SAlex Zinenko // Create a launch op and move the body region of the innermost loop to the 2394ead2cf7SAlex Zinenko // launch op. 
2404ead2cf7SAlex Zinenko auto launchOp = builder.create<gpu::LaunchOp>( 2414ead2cf7SAlex Zinenko rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX, 2424ead2cf7SAlex Zinenko blockSizeY, blockSizeZ); 2434ead2cf7SAlex Zinenko 2444ead2cf7SAlex Zinenko // Replace the loop terminator (loops contain only a single block) with the 2454ead2cf7SAlex Zinenko // gpu terminator and move the operations from the loop body block to the gpu 2464ead2cf7SAlex Zinenko // launch body block. Do not move the entire block because of the difference 2474ead2cf7SAlex Zinenko // in block arguments. 2484ead2cf7SAlex Zinenko Operation &terminator = innermostForOp.getBody()->back(); 2494ead2cf7SAlex Zinenko Location terminatorLoc = terminator.getLoc(); 2504ead2cf7SAlex Zinenko terminator.erase(); 2514ead2cf7SAlex Zinenko builder.setInsertionPointToEnd(innermostForOp.getBody()); 2524ead2cf7SAlex Zinenko builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None); 2534ead2cf7SAlex Zinenko launchOp.body().front().getOperations().splice( 2544ead2cf7SAlex Zinenko launchOp.body().front().begin(), 2554ead2cf7SAlex Zinenko innermostForOp.getBody()->getOperations()); 2564ead2cf7SAlex Zinenko 2574ead2cf7SAlex Zinenko // Remap the loop iterators to use block/thread identifiers instead. Loops 2584ead2cf7SAlex Zinenko // may iterate from LB with step S whereas GPU thread/block ids always iterate 2594ead2cf7SAlex Zinenko // from 0 to N with step 1. Therefore, loop induction variables are replaced 2604ead2cf7SAlex Zinenko // with (gpu-thread/block-id * S) + LB. 2614ead2cf7SAlex Zinenko builder.setInsertionPointToStart(&launchOp.body().front()); 2624ead2cf7SAlex Zinenko auto lbArgumentIt = lbs.begin(); 2634ead2cf7SAlex Zinenko auto stepArgumentIt = steps.begin(); 2644ead2cf7SAlex Zinenko for (auto en : llvm::enumerate(ivs)) { 2654ead2cf7SAlex Zinenko Value id = 2664ead2cf7SAlex Zinenko en.index() < numBlockDims 2674ead2cf7SAlex Zinenko ? 
getDim3Value(launchOp.getBlockIds(), en.index()) 2684ead2cf7SAlex Zinenko : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims); 2694ead2cf7SAlex Zinenko Value step = steps[en.index()]; 2704ead2cf7SAlex Zinenko if (!isConstantOne(step)) 271*a54f4eaeSMogball id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id); 2724ead2cf7SAlex Zinenko 2734ead2cf7SAlex Zinenko Value ivReplacement = 274*a54f4eaeSMogball builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id); 2754ead2cf7SAlex Zinenko en.value().replaceAllUsesWith(ivReplacement); 2764ead2cf7SAlex Zinenko std::advance(lbArgumentIt, 1); 2774ead2cf7SAlex Zinenko std::advance(stepArgumentIt, 1); 2784ead2cf7SAlex Zinenko } 2794ead2cf7SAlex Zinenko 2804ead2cf7SAlex Zinenko // We are done and can erase the original outermost loop. 2814ead2cf7SAlex Zinenko rootForOp.erase(); 2824ead2cf7SAlex Zinenko } 2834ead2cf7SAlex Zinenko 2844ead2cf7SAlex Zinenko // Generic loop to GPU kernel conversion function. 2852bcd1927SMaheshRavishankar static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp, 2864ead2cf7SAlex Zinenko unsigned numBlockDims, 2874ead2cf7SAlex Zinenko unsigned numThreadDims) { 2882bcd1927SMaheshRavishankar if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims))) 2894ead2cf7SAlex Zinenko return failure(); 2904ead2cf7SAlex Zinenko 2912bcd1927SMaheshRavishankar AffineLoopToGpuConverter converter; 2924ead2cf7SAlex Zinenko auto maybeInnerLoop = 2934ead2cf7SAlex Zinenko converter.collectBounds(forOp, numBlockDims + numThreadDims); 2944ead2cf7SAlex Zinenko if (!maybeInnerLoop) 2954ead2cf7SAlex Zinenko return failure(); 2964ead2cf7SAlex Zinenko converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims); 2974ead2cf7SAlex Zinenko 2984ead2cf7SAlex Zinenko return success(); 2994ead2cf7SAlex Zinenko } 3004ead2cf7SAlex Zinenko 3014ead2cf7SAlex Zinenko LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp, 3024ead2cf7SAlex 
Zinenko unsigned numBlockDims, 3034ead2cf7SAlex Zinenko unsigned numThreadDims) { 3042bcd1927SMaheshRavishankar return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims); 3054ead2cf7SAlex Zinenko } 3064ead2cf7SAlex Zinenko 3074ead2cf7SAlex Zinenko namespace { 3084ead2cf7SAlex Zinenko struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> { 3094ead2cf7SAlex Zinenko using OpRewritePattern<ParallelOp>::OpRewritePattern; 3104ead2cf7SAlex Zinenko 3114ead2cf7SAlex Zinenko LogicalResult matchAndRewrite(ParallelOp parallelOp, 3124ead2cf7SAlex Zinenko PatternRewriter &rewriter) const override; 3134ead2cf7SAlex Zinenko }; 3144ead2cf7SAlex Zinenko } // namespace 3154ead2cf7SAlex Zinenko 3164ead2cf7SAlex Zinenko /// Tries to derive a static upper bound from the defining operation of 3174ead2cf7SAlex Zinenko /// `upperBound`. 3184ead2cf7SAlex Zinenko static Value deriveStaticUpperBound(Value upperBound, 3194ead2cf7SAlex Zinenko PatternRewriter &rewriter) { 320*a54f4eaeSMogball if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) { 3214ead2cf7SAlex Zinenko return op; 3224ead2cf7SAlex Zinenko } 3234ead2cf7SAlex Zinenko 3244ead2cf7SAlex Zinenko if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) { 3254ead2cf7SAlex Zinenko for (const AffineExpr &result : minOp.map().getResults()) { 3264ead2cf7SAlex Zinenko if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) { 327*a54f4eaeSMogball return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(), 3284ead2cf7SAlex Zinenko constExpr.getValue()); 3294ead2cf7SAlex Zinenko } 3304ead2cf7SAlex Zinenko } 3314ead2cf7SAlex Zinenko } 3324ead2cf7SAlex Zinenko 333*a54f4eaeSMogball if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) { 334*a54f4eaeSMogball if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>( 3354ead2cf7SAlex Zinenko deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter) 3364ead2cf7SAlex Zinenko .getDefiningOp())) 337*a54f4eaeSMogball if (auto rhs = 
dyn_cast_or_null<arith::ConstantIndexOp>( 3384ead2cf7SAlex Zinenko deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter) 3394ead2cf7SAlex Zinenko .getDefiningOp())) { 3404ead2cf7SAlex Zinenko // Assumptions about the upper bound of minimum computations no longer 3414ead2cf7SAlex Zinenko // work if multiplied by a negative value, so abort in this case. 342*a54f4eaeSMogball if (lhs.value() < 0 || rhs.value() < 0) 3434ead2cf7SAlex Zinenko return {}; 3444ead2cf7SAlex Zinenko 345*a54f4eaeSMogball return rewriter.create<arith::ConstantIndexOp>( 346*a54f4eaeSMogball multiplyOp.getLoc(), lhs.value() * rhs.value()); 3474ead2cf7SAlex Zinenko } 3484ead2cf7SAlex Zinenko } 3494ead2cf7SAlex Zinenko 3504ead2cf7SAlex Zinenko return {}; 3514ead2cf7SAlex Zinenko } 3524ead2cf7SAlex Zinenko 3534ead2cf7SAlex Zinenko static bool isMappedToProcessor(gpu::Processor processor) { 3544ead2cf7SAlex Zinenko return processor != gpu::Processor::Sequential; 3554ead2cf7SAlex Zinenko } 3564ead2cf7SAlex Zinenko 3574ead2cf7SAlex Zinenko static unsigned getLaunchOpArgumentNum(gpu::Processor processor) { 3584ead2cf7SAlex Zinenko switch (processor) { 3594ead2cf7SAlex Zinenko case gpu::Processor::BlockX: 3604ead2cf7SAlex Zinenko return 0; 3614ead2cf7SAlex Zinenko case gpu::Processor::BlockY: 3624ead2cf7SAlex Zinenko return 1; 3634ead2cf7SAlex Zinenko case gpu::Processor::BlockZ: 3644ead2cf7SAlex Zinenko return 2; 3654ead2cf7SAlex Zinenko case gpu::Processor::ThreadX: 3664ead2cf7SAlex Zinenko return 3; 3674ead2cf7SAlex Zinenko case gpu::Processor::ThreadY: 3684ead2cf7SAlex Zinenko return 4; 3694ead2cf7SAlex Zinenko case gpu::Processor::ThreadZ: 3704ead2cf7SAlex Zinenko return 5; 3714ead2cf7SAlex Zinenko default:; 3724ead2cf7SAlex Zinenko } 3734ead2cf7SAlex Zinenko llvm_unreachable( 3744ead2cf7SAlex Zinenko "invalid processor type while retrieving launch op argument number"); 3754ead2cf7SAlex Zinenko } 3764ead2cf7SAlex Zinenko 3774ead2cf7SAlex Zinenko /// Modifies the current transformation state to 
capture the effect of the given 3784ead2cf7SAlex Zinenko /// `scf.parallel` operation on index substitutions and the operations to be 3794ead2cf7SAlex Zinenko /// inserted. 3804ead2cf7SAlex Zinenko /// Specifically, if a dimension of a parallel loop is mapped to a hardware id, 3814ead2cf7SAlex Zinenko /// this function will 3824ead2cf7SAlex Zinenko /// - compute the loop index based on the hardware id and affine map from the 3834ead2cf7SAlex Zinenko /// mapping and update `cloningMap` to substitute all uses. 3844ead2cf7SAlex Zinenko /// - derive a new upper bound for the hardware id and augment the provided 3854ead2cf7SAlex Zinenko /// `gpu.launch operation` accordingly. 3864ead2cf7SAlex Zinenko /// - if the upper bound is imprecise, insert a conditional in the `gpu.launch` 3874ead2cf7SAlex Zinenko /// and update the rewriter to insert into the conditional's body. 3884ead2cf7SAlex Zinenko /// If the dimension is mapped to sequential, 3894ead2cf7SAlex Zinenko /// - insert a for loop into the body and update the rewriter to insert into 3904ead2cf7SAlex Zinenko /// the for loop's body. 3914ead2cf7SAlex Zinenko /// - update the `cloningMap` to replace uses of the index with the index of 3924ead2cf7SAlex Zinenko /// the new for loop. 3934ead2cf7SAlex Zinenko /// In either case, 3944ead2cf7SAlex Zinenko /// - append the instructions from the loops body to worklist, in reverse order. 3954ead2cf7SAlex Zinenko /// To note the end of the current scope in case a loop or conditional was 3964ead2cf7SAlex Zinenko /// inserted, a sentinel (the `gpu.launch` operation) is inserted into the 3974ead2cf7SAlex Zinenko /// worklist. This signals the processor of the worklist to pop the rewriter 3984ead2cf7SAlex Zinenko /// one scope-level up. 
3994ead2cf7SAlex Zinenko static LogicalResult processParallelLoop( 4004ead2cf7SAlex Zinenko ParallelOp parallelOp, gpu::LaunchOp launchOp, 4014ead2cf7SAlex Zinenko BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist, 4024ead2cf7SAlex Zinenko DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) { 4039db53a18SRiver Riddle // TODO: Verify that this is a valid GPU mapping. 4044ead2cf7SAlex Zinenko // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential 4054ead2cf7SAlex Zinenko ArrayAttr mapping = 4060bf4a82aSChristian Sigg parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName()); 4074ead2cf7SAlex Zinenko 4089db53a18SRiver Riddle // TODO: Support reductions. 4094ead2cf7SAlex Zinenko if (!mapping || parallelOp.getNumResults() != 0) 4104ead2cf7SAlex Zinenko return failure(); 4114ead2cf7SAlex Zinenko 4124ead2cf7SAlex Zinenko Location loc = parallelOp.getLoc(); 4134ead2cf7SAlex Zinenko 4144ead2cf7SAlex Zinenko auto launchIndependent = [&launchOp](Value val) { 4150bf4a82aSChristian Sigg return val.getParentRegion()->isAncestor(launchOp->getParentRegion()); 4164ead2cf7SAlex Zinenko }; 4174ead2cf7SAlex Zinenko 4184ead2cf7SAlex Zinenko auto ensureLaunchIndependent = [&rewriter, 4194ead2cf7SAlex Zinenko launchIndependent](Value val) -> Value { 4204ead2cf7SAlex Zinenko if (launchIndependent(val)) 4214ead2cf7SAlex Zinenko return val; 422*a54f4eaeSMogball if (auto constOp = val.getDefiningOp<arith::ConstantOp>()) 423*a54f4eaeSMogball return rewriter.create<arith::ConstantOp>(constOp.getLoc(), 424*a54f4eaeSMogball constOp.value()); 4254ead2cf7SAlex Zinenko return {}; 4264ead2cf7SAlex Zinenko }; 4274ead2cf7SAlex Zinenko 4284ead2cf7SAlex Zinenko for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(), 4294ead2cf7SAlex Zinenko parallelOp.lowerBound(), parallelOp.upperBound(), 4304ead2cf7SAlex Zinenko parallelOp.step())) { 4314ead2cf7SAlex Zinenko Attribute mappingAttribute; 4324ead2cf7SAlex Zinenko Value iv, 
lowerBound, upperBound, step; 4334ead2cf7SAlex Zinenko std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config; 4344ead2cf7SAlex Zinenko auto annotation = mappingAttribute.dyn_cast<gpu::ParallelLoopDimMapping>(); 4354ead2cf7SAlex Zinenko if (!annotation) 4364ead2cf7SAlex Zinenko return parallelOp.emitOpError() 4374ead2cf7SAlex Zinenko << "expected mapping attribute for lowering to GPU"; 4384ead2cf7SAlex Zinenko Value newIndex; 4394ead2cf7SAlex Zinenko gpu::Processor processor = gpu::getProcessor(annotation); 4404ead2cf7SAlex Zinenko 4414ead2cf7SAlex Zinenko if (isMappedToProcessor(processor)) { 4424ead2cf7SAlex Zinenko // Use the corresponding thread/grid index as replacement for the loop iv. 443e2b71610SRahul Joshi Value operand = 444e2b71610SRahul Joshi launchOp.body().getArgument(getLaunchOpArgumentNum(processor)); 4454ead2cf7SAlex Zinenko // Take the indexmap and add the lower bound and step computations in. 4464ead2cf7SAlex Zinenko // This computes operand * step + lowerBound. 4474ead2cf7SAlex Zinenko // Use an affine map here so that it composes nicely with the provided 4484ead2cf7SAlex Zinenko // annotation. 4494ead2cf7SAlex Zinenko AffineMap lowerAndStep = AffineMap::get( 4504ead2cf7SAlex Zinenko 1, 2, 4514ead2cf7SAlex Zinenko rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) + 4524ead2cf7SAlex Zinenko rewriter.getAffineSymbolExpr(1)); 4534ead2cf7SAlex Zinenko newIndex = rewriter.create<AffineApplyOp>( 4544ead2cf7SAlex Zinenko loc, annotation.map().getValue().compose(lowerAndStep), 4554ead2cf7SAlex Zinenko ValueRange{operand, step, lowerBound}); 4564ead2cf7SAlex Zinenko // If there was also a bound, insert that, too. 4579db53a18SRiver Riddle // TODO: Check that we do not assign bounds twice. 4584ead2cf7SAlex Zinenko if (annotation.bound().getValue()) { 4594ead2cf7SAlex Zinenko // We pass as the single operand to the bound-map the number of 4604ead2cf7SAlex Zinenko // iterations, which is (upperBound - lowerBound) ceilDiv step. 
To 4614ead2cf7SAlex Zinenko // support inner loops with dynamic upper bounds (as generated by e.g. 4624ead2cf7SAlex Zinenko // tiling), try to derive a max for the bounds. If the used bound for 4634ead2cf7SAlex Zinenko // the hardware id is imprecise, wrap the contained code into a 4644ead2cf7SAlex Zinenko // conditional. If the lower-bound is constant or defined before the 4654ead2cf7SAlex Zinenko // launch, we can use it in the launch bounds. Otherwise fail. 4664ead2cf7SAlex Zinenko if (!launchIndependent(lowerBound) && 467*a54f4eaeSMogball !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp())) 4684ead2cf7SAlex Zinenko return failure(); 4694ead2cf7SAlex Zinenko // The step must also be constant or defined outside of the loop nest. 4704ead2cf7SAlex Zinenko if (!launchIndependent(step) && 471*a54f4eaeSMogball !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp())) 4724ead2cf7SAlex Zinenko return failure(); 4734ead2cf7SAlex Zinenko // If the upper-bound is constant or defined before the launch, we can 4744ead2cf7SAlex Zinenko // use it in the launch bounds directly. Otherwise try derive a bound. 
4754ead2cf7SAlex Zinenko bool boundIsPrecise = 4764ead2cf7SAlex Zinenko launchIndependent(upperBound) || 477*a54f4eaeSMogball isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp()); 4784ead2cf7SAlex Zinenko { 4794ead2cf7SAlex Zinenko PatternRewriter::InsertionGuard guard(rewriter); 4804ead2cf7SAlex Zinenko rewriter.setInsertionPoint(launchOp); 4814ead2cf7SAlex Zinenko if (!boundIsPrecise) { 4824ead2cf7SAlex Zinenko upperBound = deriveStaticUpperBound(upperBound, rewriter); 4834ead2cf7SAlex Zinenko if (!upperBound) { 4845da2423bSStephan Herhut return rewriter.notifyMatchFailure( 4855da2423bSStephan Herhut parallelOp, 4865da2423bSStephan Herhut "cannot derive loop-invariant upper bound for number of" 4875da2423bSStephan Herhut "iterations"); 4884ead2cf7SAlex Zinenko } 4894ead2cf7SAlex Zinenko } 4904ead2cf7SAlex Zinenko // Compute the number of iterations needed. We compute this as an 4914ead2cf7SAlex Zinenko // affine expression ceilDiv (upperBound - lowerBound) step. We use 4924ead2cf7SAlex Zinenko // affine.apply here so that it composes nicely with the provided map. 49372d5ac90STres Popp AffineMap stepMap = AffineMap::get( 49472d5ac90STres Popp 1, 2, 49572d5ac90STres Popp ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0)) 49672d5ac90STres Popp .ceilDiv(rewriter.getAffineSymbolExpr(1)))); 4974ead2cf7SAlex Zinenko Value launchBound = rewriter.create<AffineApplyOp>( 4984ead2cf7SAlex Zinenko loc, annotation.bound().getValue().compose(stepMap), 4994ead2cf7SAlex Zinenko ValueRange{ 5004ead2cf7SAlex Zinenko ensureLaunchIndependent( 5014ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(upperBound)), 5024ead2cf7SAlex Zinenko ensureLaunchIndependent( 5034ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(lowerBound)), 5044ead2cf7SAlex Zinenko ensureLaunchIndependent(cloningMap.lookupOrDefault(step))}); 5054ead2cf7SAlex Zinenko // todo(herhut,ravishankarm): Update the behavior of setMappingAttr 5064ead2cf7SAlex Zinenko // when this condition is relaxed. 
5074ead2cf7SAlex Zinenko if (bounds.find(processor) != bounds.end()) { 5085da2423bSStephan Herhut return rewriter.notifyMatchFailure( 5095da2423bSStephan Herhut parallelOp, "cannot redefine the bound for processor " + 5105da2423bSStephan Herhut Twine(static_cast<int64_t>(processor))); 5114ead2cf7SAlex Zinenko } 5124ead2cf7SAlex Zinenko bounds[processor] = launchBound; 5134ead2cf7SAlex Zinenko } 5144ead2cf7SAlex Zinenko if (!boundIsPrecise) { 5154ead2cf7SAlex Zinenko // We are using an approximation, create a surrounding conditional. 5164ead2cf7SAlex Zinenko Value originalBound = std::get<3>(config); 517*a54f4eaeSMogball arith::CmpIOp pred = rewriter.create<arith::CmpIOp>( 518*a54f4eaeSMogball loc, arith::CmpIPredicate::slt, newIndex, 5194ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(originalBound)); 5204ead2cf7SAlex Zinenko scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false); 5214ead2cf7SAlex Zinenko rewriter.setInsertionPointToStart(&ifOp.thenRegion().front()); 5224ead2cf7SAlex Zinenko // Put a sentinel into the worklist so we know when to pop out of the 5234ead2cf7SAlex Zinenko // if body again. We use the launchOp here, as that cannot be part of 5244ead2cf7SAlex Zinenko // the bodies instruction. 5254ead2cf7SAlex Zinenko worklist.push_back(launchOp.getOperation()); 5264ead2cf7SAlex Zinenko } 5274ead2cf7SAlex Zinenko } 5284ead2cf7SAlex Zinenko } else { 5294ead2cf7SAlex Zinenko // Create a sequential for loop. 5304ead2cf7SAlex Zinenko auto loopOp = rewriter.create<scf::ForOp>( 5314ead2cf7SAlex Zinenko loc, cloningMap.lookupOrDefault(lowerBound), 5324ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(upperBound), 5334ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(step)); 5344ead2cf7SAlex Zinenko newIndex = loopOp.getInductionVar(); 5354ead2cf7SAlex Zinenko rewriter.setInsertionPointToStart(loopOp.getBody()); 5364ead2cf7SAlex Zinenko // Put a sentinel into the worklist so we know when to pop out of the loop 5374ead2cf7SAlex Zinenko // body again. 
We use the launchOp here, as that cannot be part of the 5384ead2cf7SAlex Zinenko // bodies instruction. 5394ead2cf7SAlex Zinenko worklist.push_back(launchOp.getOperation()); 5404ead2cf7SAlex Zinenko } 5414ead2cf7SAlex Zinenko cloningMap.map(iv, newIndex); 5424ead2cf7SAlex Zinenko } 543396e7f45SArtur Bialas 544396e7f45SArtur Bialas // Propagate custom user defined optional attributes, that can be used at 545396e7f45SArtur Bialas // later stage, such as extension data for GPU kernel dispatch 54656774bddSMarius Brehler for (const auto &namedAttr : parallelOp->getAttrs()) { 547396e7f45SArtur Bialas if (namedAttr.first == gpu::getMappingAttrName() || 548396e7f45SArtur Bialas namedAttr.first == ParallelOp::getOperandSegmentSizeAttr()) 549396e7f45SArtur Bialas continue; 5501ffc1aaaSChristian Sigg launchOp->setAttr(namedAttr.first, namedAttr.second); 551396e7f45SArtur Bialas } 552396e7f45SArtur Bialas 5534ead2cf7SAlex Zinenko Block *body = parallelOp.getBody(); 5544ead2cf7SAlex Zinenko worklist.reserve(worklist.size() + body->getOperations().size()); 5554ead2cf7SAlex Zinenko for (Operation &op : llvm::reverse(body->without_terminator())) 5564ead2cf7SAlex Zinenko worklist.push_back(&op); 5574ead2cf7SAlex Zinenko return success(); 5584ead2cf7SAlex Zinenko } 5594ead2cf7SAlex Zinenko 5604ead2cf7SAlex Zinenko /// Lower a `scf.parallel` operation into a corresponding `gpu.launch` 5614ead2cf7SAlex Zinenko /// operation. 5624ead2cf7SAlex Zinenko /// 5634ead2cf7SAlex Zinenko /// This essentially transforms a loop nest into a corresponding SIMT function. 5644ead2cf7SAlex Zinenko /// The conversion is driven by mapping annotations on the `scf.parallel` 5654ead2cf7SAlex Zinenko /// operations. The mapping is provided via a `DictionaryAttribute` named 5664ead2cf7SAlex Zinenko /// `mapping`, which has three entries: 5674ead2cf7SAlex Zinenko /// - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are 5684ead2cf7SAlex Zinenko /// thread dimensions and 6 is sequential. 
5694ead2cf7SAlex Zinenko /// - map : An affine map that is used to pre-process hardware ids before 5704ead2cf7SAlex Zinenko /// substitution. 5714ead2cf7SAlex Zinenko /// - bound : An affine map that is used to compute the bound of the hardware 5724ead2cf7SAlex Zinenko /// id based on an upper bound of the number of iterations. 5734ead2cf7SAlex Zinenko /// If the `scf.parallel` contains nested `scf.parallel` operations, those 5744ead2cf7SAlex Zinenko /// need to be annotated, as well. Structurally, the transformation works by 5754ead2cf7SAlex Zinenko /// splicing all operations from nested `scf.parallel` operations into a single 5764ead2cf7SAlex Zinenko /// sequence. Indices mapped to hardware ids are substituted with those ids, 5774ead2cf7SAlex Zinenko /// whereas sequential mappings result in a sequential for-loop. To have more 5784ead2cf7SAlex Zinenko /// flexibility when mapping code to hardware ids, the transform supports two 5794ead2cf7SAlex Zinenko /// affine maps. The first `map` is used to compute the actual index for 5804ead2cf7SAlex Zinenko /// substitution from the hardware id. The second `bound` is used to compute the 5814ead2cf7SAlex Zinenko /// launch dimension for the hardware id from the number of iterations the 5824ead2cf7SAlex Zinenko /// mapped loop is performing. Note that the number of iterations might be 5834ead2cf7SAlex Zinenko /// imprecise if the corresponding loop-bounds are loop-dependent. In such a case, 5844ead2cf7SAlex Zinenko /// the hardware id might iterate over additional indices. The transformation 5854ead2cf7SAlex Zinenko /// caters for this by predicating the created sequence of instructions on 5864ead2cf7SAlex Zinenko /// the actual loop bound. This only works if a static upper bound for the 5874ead2cf7SAlex Zinenko /// dynamic loop bound can be derived, currently via analyzing `affine.min` 5884ead2cf7SAlex Zinenko /// operations. 
5894ead2cf7SAlex Zinenko LogicalResult 5904ead2cf7SAlex Zinenko ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, 5914ead2cf7SAlex Zinenko PatternRewriter &rewriter) const { 592ec03bbe8SVladislav Vinogradov // Mark the operation as visited for recursive legality check. 593ec03bbe8SVladislav Vinogradov parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr()); 594ec03bbe8SVladislav Vinogradov 5955da2423bSStephan Herhut // We can only transform starting at the outer-most loop. Launches inside of 5965da2423bSStephan Herhut // parallel loops are not supported. 5970bf4a82aSChristian Sigg if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>()) 5985da2423bSStephan Herhut return failure(); 5994ead2cf7SAlex Zinenko // Create a launch operation. We start with bound one for all grid/block 6004ead2cf7SAlex Zinenko // sizes. Those will be refined later as we discover them from mappings. 6014ead2cf7SAlex Zinenko Location loc = parallelOp.getLoc(); 602*a54f4eaeSMogball Value constantOne = 603*a54f4eaeSMogball rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1); 6044ead2cf7SAlex Zinenko gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>( 6054ead2cf7SAlex Zinenko parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne, 6064ead2cf7SAlex Zinenko constantOne, constantOne); 6074ead2cf7SAlex Zinenko rewriter.setInsertionPointToEnd(&launchOp.body().front()); 6084ead2cf7SAlex Zinenko rewriter.create<gpu::TerminatorOp>(loc); 6094ead2cf7SAlex Zinenko rewriter.setInsertionPointToStart(&launchOp.body().front()); 6104ead2cf7SAlex Zinenko 6114ead2cf7SAlex Zinenko BlockAndValueMapping cloningMap; 6124ead2cf7SAlex Zinenko llvm::DenseMap<gpu::Processor, Value> launchBounds; 6134ead2cf7SAlex Zinenko SmallVector<Operation *, 16> worklist; 6144ead2cf7SAlex Zinenko if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist, 6154ead2cf7SAlex Zinenko launchBounds, rewriter))) 6164ead2cf7SAlex Zinenko return failure(); 
6174ead2cf7SAlex Zinenko 6184ead2cf7SAlex Zinenko // Whether we have seen any side-effects. Reset when leaving an inner scope. 6194ead2cf7SAlex Zinenko bool seenSideeffects = false; 6204ead2cf7SAlex Zinenko // Whether we have left a nesting scope (and hence are no longer innermost). 6214ead2cf7SAlex Zinenko bool leftNestingScope = false; 6224ead2cf7SAlex Zinenko while (!worklist.empty()) { 6234ead2cf7SAlex Zinenko Operation *op = worklist.pop_back_val(); 6244ead2cf7SAlex Zinenko // Now walk over the body and clone it. 6254ead2cf7SAlex Zinenko // TODO: This is only correct if there either is no further scf.parallel 6264ead2cf7SAlex Zinenko // nested or this code is side-effect free. Otherwise we might need 6274ead2cf7SAlex Zinenko // predication. We are overly conservative for now and only allow 6284ead2cf7SAlex Zinenko // side-effects in the innermost scope. 6294ead2cf7SAlex Zinenko if (auto nestedParallel = dyn_cast<ParallelOp>(op)) { 6304ead2cf7SAlex Zinenko // Before entering a nested scope, make sure there have been no 6314ead2cf7SAlex Zinenko // sideeffects until now. 6324ead2cf7SAlex Zinenko if (seenSideeffects) 6334ead2cf7SAlex Zinenko return failure(); 6344ead2cf7SAlex Zinenko // A nested scf.parallel needs insertion of code to compute indices. 6354ead2cf7SAlex Zinenko // Insert that now. This will also update the worklist with the loops 6364ead2cf7SAlex Zinenko // body. 6374ead2cf7SAlex Zinenko if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap, 6384ead2cf7SAlex Zinenko worklist, launchBounds, rewriter))) 6394ead2cf7SAlex Zinenko return failure(); 6404ead2cf7SAlex Zinenko } else if (op == launchOp.getOperation()) { 6414ead2cf7SAlex Zinenko // Found our sentinel value. We have finished the operations from one 6424ead2cf7SAlex Zinenko // nesting level, pop one level back up. 
6434ead2cf7SAlex Zinenko auto parent = rewriter.getInsertionPoint()->getParentOp(); 6444ead2cf7SAlex Zinenko rewriter.setInsertionPointAfter(parent); 6454ead2cf7SAlex Zinenko leftNestingScope = true; 6464ead2cf7SAlex Zinenko seenSideeffects = false; 6474ead2cf7SAlex Zinenko } else { 6484ead2cf7SAlex Zinenko // Otherwise we copy it over. 6494ead2cf7SAlex Zinenko Operation *clone = rewriter.clone(*op, cloningMap); 6504ead2cf7SAlex Zinenko cloningMap.map(op->getResults(), clone->getResults()); 6514ead2cf7SAlex Zinenko // Check for side effects. 6524ead2cf7SAlex Zinenko // TODO: Handle region side effects properly. 6534ead2cf7SAlex Zinenko seenSideeffects |= !MemoryEffectOpInterface::hasNoEffect(clone) || 6544ead2cf7SAlex Zinenko clone->getNumRegions() != 0; 6554ead2cf7SAlex Zinenko // If we are no longer in the innermost scope, sideeffects are disallowed. 6564ead2cf7SAlex Zinenko if (seenSideeffects && leftNestingScope) 6574ead2cf7SAlex Zinenko return failure(); 6584ead2cf7SAlex Zinenko } 6594ead2cf7SAlex Zinenko } 6604ead2cf7SAlex Zinenko 6614ead2cf7SAlex Zinenko // Now that we succeeded creating the launch operation, also update the 6624ead2cf7SAlex Zinenko // bounds. 
6634ead2cf7SAlex Zinenko for (auto bound : launchBounds) 6644ead2cf7SAlex Zinenko launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)), 6654ead2cf7SAlex Zinenko std::get<1>(bound)); 6664ead2cf7SAlex Zinenko 6674ead2cf7SAlex Zinenko rewriter.eraseOp(parallelOp); 6684ead2cf7SAlex Zinenko return success(); 6694ead2cf7SAlex Zinenko } 6704ead2cf7SAlex Zinenko 671dc4e913bSChris Lattner void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) { 672dc4e913bSChris Lattner patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext()); 6734ead2cf7SAlex Zinenko } 6745da2423bSStephan Herhut 6755da2423bSStephan Herhut void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) { 676e2310704SJulian Gross target.addLegalDialect<memref::MemRefDialect>(); 6775da2423bSStephan Herhut target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) { 678ec03bbe8SVladislav Vinogradov return !parallelOp->hasAttr(gpu::getMappingAttrName()) || 679ec03bbe8SVladislav Vinogradov parallelOp->hasAttr(kVisitedAttrName); 680ec03bbe8SVladislav Vinogradov }); 681ec03bbe8SVladislav Vinogradov } 682ec03bbe8SVladislav Vinogradov 683ec03bbe8SVladislav Vinogradov void mlir::finalizeParallelLoopToGPUConversion(Operation *op) { 684ec03bbe8SVladislav Vinogradov op->walk([](scf::ParallelOp parallelOp) { 685ec03bbe8SVladislav Vinogradov parallelOp->removeAttr(kVisitedAttrName); 6865da2423bSStephan Herhut }); 6875da2423bSStephan Herhut } 688