14ead2cf7SAlex Zinenko //===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===// 24ead2cf7SAlex Zinenko // 34ead2cf7SAlex Zinenko // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 44ead2cf7SAlex Zinenko // See https://llvm.org/LICENSE.txt for license information. 54ead2cf7SAlex Zinenko // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 64ead2cf7SAlex Zinenko // 74ead2cf7SAlex Zinenko //===----------------------------------------------------------------------===// 84ead2cf7SAlex Zinenko // 94ead2cf7SAlex Zinenko // This implements a straightforward conversion of an loop nest into a GPU 104ead2cf7SAlex Zinenko // kernel. The caller is expected to guarantee that the conversion is correct 114ead2cf7SAlex Zinenko // or to further transform the kernel to ensure correctness. 124ead2cf7SAlex Zinenko // 134ead2cf7SAlex Zinenko //===----------------------------------------------------------------------===// 144ead2cf7SAlex Zinenko 154ead2cf7SAlex Zinenko #include "mlir/Conversion/SCFToGPU/SCFToGPU.h" 164ead2cf7SAlex Zinenko 174ead2cf7SAlex Zinenko #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" 184ead2cf7SAlex Zinenko #include "mlir/Dialect/Affine/IR/AffineOps.h" 194ead2cf7SAlex Zinenko #include "mlir/Dialect/GPU/GPUDialect.h" 204ead2cf7SAlex Zinenko #include "mlir/Dialect/GPU/ParallelLoopMapper.h" 214ead2cf7SAlex Zinenko #include "mlir/Dialect/SCF/SCF.h" 224ead2cf7SAlex Zinenko #include "mlir/Dialect/StandardOps/IR/Ops.h" 234ead2cf7SAlex Zinenko #include "mlir/IR/AffineExpr.h" 244ead2cf7SAlex Zinenko #include "mlir/IR/BlockAndValueMapping.h" 254ead2cf7SAlex Zinenko #include "mlir/IR/Builders.h" 264ead2cf7SAlex Zinenko #include "mlir/Pass/Pass.h" 274ead2cf7SAlex Zinenko #include "mlir/Transforms/DialectConversion.h" 284ead2cf7SAlex Zinenko #include "mlir/Transforms/LoopUtils.h" 294ead2cf7SAlex Zinenko #include "mlir/Transforms/Passes.h" 304ead2cf7SAlex Zinenko #include "mlir/Transforms/RegionUtils.h" 
314ead2cf7SAlex Zinenko #include "llvm/ADT/Sequence.h" 324ead2cf7SAlex Zinenko #include "llvm/Support/Debug.h" 334ead2cf7SAlex Zinenko 344ead2cf7SAlex Zinenko #define DEBUG_TYPE "loops-to-gpu" 354ead2cf7SAlex Zinenko 364ead2cf7SAlex Zinenko using namespace mlir; 374ead2cf7SAlex Zinenko using namespace mlir::scf; 384ead2cf7SAlex Zinenko 394ead2cf7SAlex Zinenko // Extract an indexed value from KernelDim3. 404ead2cf7SAlex Zinenko static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) { 414ead2cf7SAlex Zinenko switch (pos) { 424ead2cf7SAlex Zinenko case 0: 434ead2cf7SAlex Zinenko return dim3.x; 444ead2cf7SAlex Zinenko case 1: 454ead2cf7SAlex Zinenko return dim3.y; 464ead2cf7SAlex Zinenko case 2: 474ead2cf7SAlex Zinenko return dim3.z; 484ead2cf7SAlex Zinenko default: 494ead2cf7SAlex Zinenko llvm_unreachable("dim3 position out of bounds"); 504ead2cf7SAlex Zinenko } 514ead2cf7SAlex Zinenko return nullptr; 524ead2cf7SAlex Zinenko } 534ead2cf7SAlex Zinenko 544ead2cf7SAlex Zinenko // Get the lower bound-related operands of a loop operation. 554ead2cf7SAlex Zinenko static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) { 564ead2cf7SAlex Zinenko return forOp.getLowerBoundOperands(); 574ead2cf7SAlex Zinenko } 584ead2cf7SAlex Zinenko 594ead2cf7SAlex Zinenko // Get the upper bound-related operands of a loop operation. 604ead2cf7SAlex Zinenko static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) { 614ead2cf7SAlex Zinenko return forOp.getUpperBoundOperands(); 624ead2cf7SAlex Zinenko } 634ead2cf7SAlex Zinenko 644ead2cf7SAlex Zinenko // Get a Value that corresponds to the loop step. If the step is an attribute, 654ead2cf7SAlex Zinenko // materialize a corresponding constant using builder. 
664ead2cf7SAlex Zinenko static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) { 674ead2cf7SAlex Zinenko return builder.create<ConstantIndexOp>(forOp.getLoc(), forOp.getStep()); 684ead2cf7SAlex Zinenko } 694ead2cf7SAlex Zinenko 704ead2cf7SAlex Zinenko // Get a Value for the loop lower bound. If the value requires computation, 714ead2cf7SAlex Zinenko // materialize the instructions using builder. 724ead2cf7SAlex Zinenko static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) { 734ead2cf7SAlex Zinenko return lowerAffineLowerBound(forOp, builder); 744ead2cf7SAlex Zinenko } 754ead2cf7SAlex Zinenko 764ead2cf7SAlex Zinenko // Get a Value for the loop upper bound. If the value requires computation, 774ead2cf7SAlex Zinenko // materialize the instructions using builder. 784ead2cf7SAlex Zinenko static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) { 794ead2cf7SAlex Zinenko return lowerAffineUpperBound(forOp, builder); 804ead2cf7SAlex Zinenko } 814ead2cf7SAlex Zinenko 824ead2cf7SAlex Zinenko // Check the structure of the loop nest: 834ead2cf7SAlex Zinenko // - there are enough loops to map to numDims; 844ead2cf7SAlex Zinenko // - the loops are perfectly nested; 854ead2cf7SAlex Zinenko // - the loop bounds can be computed above the outermost loop. 864ead2cf7SAlex Zinenko // This roughly corresponds to the "matcher" part of the pattern-based 874ead2cf7SAlex Zinenko // rewriting infrastructure. 
882bcd1927SMaheshRavishankar static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp, 892bcd1927SMaheshRavishankar unsigned numDims) { 904ead2cf7SAlex Zinenko Region &limit = forOp.region(); 914ead2cf7SAlex Zinenko for (unsigned i = 0, e = numDims; i < e; ++i) { 924ead2cf7SAlex Zinenko Operation *nested = &forOp.getBody()->front(); 934ead2cf7SAlex Zinenko if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) || 944ead2cf7SAlex Zinenko !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit)) 954ead2cf7SAlex Zinenko return forOp.emitError( 964ead2cf7SAlex Zinenko "loops with bounds depending on other mapped loops " 974ead2cf7SAlex Zinenko "are not supported"); 984ead2cf7SAlex Zinenko 994ead2cf7SAlex Zinenko // The innermost loop can have an arbitrary body, skip the perfect nesting 1004ead2cf7SAlex Zinenko // check for it. 1014ead2cf7SAlex Zinenko if (i == e - 1) 1024ead2cf7SAlex Zinenko break; 1034ead2cf7SAlex Zinenko 1044ead2cf7SAlex Zinenko auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end(); 1054ead2cf7SAlex Zinenko if (forOp.getBody()->empty() || std::next(begin, 2) != end) 1064ead2cf7SAlex Zinenko return forOp.emitError("expected perfectly nested loops in the body"); 1074ead2cf7SAlex Zinenko 1082bcd1927SMaheshRavishankar if (!(forOp = dyn_cast<AffineForOp>(nested))) 1094ead2cf7SAlex Zinenko return nested->emitError("expected a nested loop"); 1104ead2cf7SAlex Zinenko } 1114ead2cf7SAlex Zinenko return success(); 1124ead2cf7SAlex Zinenko } 1134ead2cf7SAlex Zinenko 1142bcd1927SMaheshRavishankar static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp, 1152bcd1927SMaheshRavishankar unsigned numBlockDims, 1164ead2cf7SAlex Zinenko unsigned numThreadDims) { 1174ead2cf7SAlex Zinenko if (numBlockDims < 1 || numThreadDims < 1) { 1184ead2cf7SAlex Zinenko LLVM_DEBUG(llvm::dbgs() << "nothing to map"); 1194ead2cf7SAlex Zinenko return success(); 1204ead2cf7SAlex Zinenko } 1214ead2cf7SAlex Zinenko 1224ead2cf7SAlex Zinenko if 
(numBlockDims > 3) { 1234ead2cf7SAlex Zinenko return forOp.emitError("cannot map to more than 3 block dimensions"); 1244ead2cf7SAlex Zinenko } 1254ead2cf7SAlex Zinenko if (numThreadDims > 3) { 1264ead2cf7SAlex Zinenko return forOp.emitError("cannot map to more than 3 thread dimensions"); 1274ead2cf7SAlex Zinenko } 1282bcd1927SMaheshRavishankar return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims); 1294ead2cf7SAlex Zinenko } 1304ead2cf7SAlex Zinenko 1314ead2cf7SAlex Zinenko namespace { 1324ead2cf7SAlex Zinenko // Helper structure that holds common state of the loop to GPU kernel 1334ead2cf7SAlex Zinenko // conversion. 1342bcd1927SMaheshRavishankar struct AffineLoopToGpuConverter { 1352bcd1927SMaheshRavishankar Optional<AffineForOp> collectBounds(AffineForOp forOp, unsigned numLoops); 1364ead2cf7SAlex Zinenko 1372bcd1927SMaheshRavishankar void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp, 1382bcd1927SMaheshRavishankar unsigned numBlockDims, unsigned numThreadDims); 1394ead2cf7SAlex Zinenko 1404ead2cf7SAlex Zinenko // Ranges of the loops mapped to blocks or threads. 1414ead2cf7SAlex Zinenko SmallVector<Value, 6> dims; 1424ead2cf7SAlex Zinenko // Lower bounds of the loops mapped to blocks or threads. 1434ead2cf7SAlex Zinenko SmallVector<Value, 6> lbs; 1444ead2cf7SAlex Zinenko // Induction variables of the loops mapped to blocks or threads. 1454ead2cf7SAlex Zinenko SmallVector<Value, 6> ivs; 1464ead2cf7SAlex Zinenko // Steps of the loops mapped to blocks or threads. 1474ead2cf7SAlex Zinenko SmallVector<Value, 6> steps; 1484ead2cf7SAlex Zinenko }; 1494ead2cf7SAlex Zinenko } // namespace 1504ead2cf7SAlex Zinenko 1514ead2cf7SAlex Zinenko // Return true if the value is obviously a constant "one". 
1524ead2cf7SAlex Zinenko static bool isConstantOne(Value value) { 1534ead2cf7SAlex Zinenko if (auto def = value.getDefiningOp<ConstantIndexOp>()) 1544ead2cf7SAlex Zinenko return def.getValue() == 1; 1554ead2cf7SAlex Zinenko return false; 1564ead2cf7SAlex Zinenko } 1574ead2cf7SAlex Zinenko 1584ead2cf7SAlex Zinenko // Collect ranges, bounds, steps and induction variables in preparation for 1594ead2cf7SAlex Zinenko // mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel. 1604ead2cf7SAlex Zinenko // This may fail if the IR for computing loop bounds cannot be constructed, for 1614ead2cf7SAlex Zinenko // example if an affine loop uses semi-affine maps. Return the last loop to be 1624ead2cf7SAlex Zinenko // mapped on success, llvm::None on failure. 1632bcd1927SMaheshRavishankar Optional<AffineForOp> 1642bcd1927SMaheshRavishankar AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) { 1654ead2cf7SAlex Zinenko OpBuilder builder(forOp.getOperation()); 1664ead2cf7SAlex Zinenko dims.reserve(numLoops); 1674ead2cf7SAlex Zinenko lbs.reserve(numLoops); 1684ead2cf7SAlex Zinenko ivs.reserve(numLoops); 1694ead2cf7SAlex Zinenko steps.reserve(numLoops); 1702bcd1927SMaheshRavishankar AffineForOp currentLoop = forOp; 1714ead2cf7SAlex Zinenko for (unsigned i = 0; i < numLoops; ++i) { 1724ead2cf7SAlex Zinenko Value lowerBound = getOrEmitLowerBound(currentLoop, builder); 1734ead2cf7SAlex Zinenko Value upperBound = getOrEmitUpperBound(currentLoop, builder); 1744ead2cf7SAlex Zinenko if (!lowerBound || !upperBound) { 1754ead2cf7SAlex Zinenko return llvm::None; 1764ead2cf7SAlex Zinenko } 1774ead2cf7SAlex Zinenko 1784ead2cf7SAlex Zinenko Value range = 1794ead2cf7SAlex Zinenko builder.create<SubIOp>(currentLoop.getLoc(), upperBound, lowerBound); 1804ead2cf7SAlex Zinenko Value step = getOrCreateStep(currentLoop, builder); 1814ead2cf7SAlex Zinenko if (!isConstantOne(step)) 1824ead2cf7SAlex Zinenko range = 
builder.create<SignedDivIOp>(currentLoop.getLoc(), range, step); 1834ead2cf7SAlex Zinenko dims.push_back(range); 1844ead2cf7SAlex Zinenko 1854ead2cf7SAlex Zinenko lbs.push_back(lowerBound); 1864ead2cf7SAlex Zinenko ivs.push_back(currentLoop.getInductionVar()); 1874ead2cf7SAlex Zinenko steps.push_back(step); 1884ead2cf7SAlex Zinenko 1894ead2cf7SAlex Zinenko if (i != numLoops - 1) 1902bcd1927SMaheshRavishankar currentLoop = cast<AffineForOp>(¤tLoop.getBody()->front()); 1914ead2cf7SAlex Zinenko } 1924ead2cf7SAlex Zinenko return currentLoop; 1934ead2cf7SAlex Zinenko } 1944ead2cf7SAlex Zinenko 1954ead2cf7SAlex Zinenko // Replace the rooted at "rootForOp" with a GPU launch operation. This expects 1964ead2cf7SAlex Zinenko // "innermostForOp" to point to the last loop to be transformed to the kernel, 1974ead2cf7SAlex Zinenko // and to have (numBlockDims + numThreadDims) perfectly nested loops between 1984ead2cf7SAlex Zinenko // "rootForOp" and "innermostForOp". 1992bcd1927SMaheshRavishankar void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp, 2002bcd1927SMaheshRavishankar AffineForOp innermostForOp, 2014ead2cf7SAlex Zinenko unsigned numBlockDims, 2024ead2cf7SAlex Zinenko unsigned numThreadDims) { 2034ead2cf7SAlex Zinenko OpBuilder builder(rootForOp.getOperation()); 2044ead2cf7SAlex Zinenko // Prepare the grid and block sizes for the launch operation. If there is 2054ead2cf7SAlex Zinenko // no loop mapped to a specific dimension, use constant "1" as its size. 2064ead2cf7SAlex Zinenko Value constOne = (numBlockDims < 3 || numThreadDims < 3) 2074ead2cf7SAlex Zinenko ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1) 2084ead2cf7SAlex Zinenko : nullptr; 2094ead2cf7SAlex Zinenko Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne; 2104ead2cf7SAlex Zinenko Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne; 2114ead2cf7SAlex Zinenko Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne; 2124ead2cf7SAlex Zinenko Value blockSizeX = numThreadDims > 0 ? 
dims[numBlockDims] : constOne; 2134ead2cf7SAlex Zinenko Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne; 2144ead2cf7SAlex Zinenko Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne; 2154ead2cf7SAlex Zinenko 2164ead2cf7SAlex Zinenko // Create a launch op and move the body region of the innermost loop to the 2174ead2cf7SAlex Zinenko // launch op. 2184ead2cf7SAlex Zinenko auto launchOp = builder.create<gpu::LaunchOp>( 2194ead2cf7SAlex Zinenko rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX, 2204ead2cf7SAlex Zinenko blockSizeY, blockSizeZ); 2214ead2cf7SAlex Zinenko 2224ead2cf7SAlex Zinenko // Replace the loop terminator (loops contain only a single block) with the 2234ead2cf7SAlex Zinenko // gpu terminator and move the operations from the loop body block to the gpu 2244ead2cf7SAlex Zinenko // launch body block. Do not move the entire block because of the difference 2254ead2cf7SAlex Zinenko // in block arguments. 2264ead2cf7SAlex Zinenko Operation &terminator = innermostForOp.getBody()->back(); 2274ead2cf7SAlex Zinenko Location terminatorLoc = terminator.getLoc(); 2284ead2cf7SAlex Zinenko terminator.erase(); 2294ead2cf7SAlex Zinenko builder.setInsertionPointToEnd(innermostForOp.getBody()); 2304ead2cf7SAlex Zinenko builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None); 2314ead2cf7SAlex Zinenko launchOp.body().front().getOperations().splice( 2324ead2cf7SAlex Zinenko launchOp.body().front().begin(), 2334ead2cf7SAlex Zinenko innermostForOp.getBody()->getOperations()); 2344ead2cf7SAlex Zinenko 2354ead2cf7SAlex Zinenko // Remap the loop iterators to use block/thread identifiers instead. Loops 2364ead2cf7SAlex Zinenko // may iterate from LB with step S whereas GPU thread/block ids always iterate 2374ead2cf7SAlex Zinenko // from 0 to N with step 1. Therefore, loop induction variables are replaced 2384ead2cf7SAlex Zinenko // with (gpu-thread/block-id * S) + LB. 
2394ead2cf7SAlex Zinenko builder.setInsertionPointToStart(&launchOp.body().front()); 2404ead2cf7SAlex Zinenko auto lbArgumentIt = lbs.begin(); 2414ead2cf7SAlex Zinenko auto stepArgumentIt = steps.begin(); 2424ead2cf7SAlex Zinenko for (auto en : llvm::enumerate(ivs)) { 2434ead2cf7SAlex Zinenko Value id = 2444ead2cf7SAlex Zinenko en.index() < numBlockDims 2454ead2cf7SAlex Zinenko ? getDim3Value(launchOp.getBlockIds(), en.index()) 2464ead2cf7SAlex Zinenko : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims); 2474ead2cf7SAlex Zinenko Value step = steps[en.index()]; 2484ead2cf7SAlex Zinenko if (!isConstantOne(step)) 2494ead2cf7SAlex Zinenko id = builder.create<MulIOp>(rootForOp.getLoc(), step, id); 2504ead2cf7SAlex Zinenko 2514ead2cf7SAlex Zinenko Value ivReplacement = 2524ead2cf7SAlex Zinenko builder.create<AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id); 2534ead2cf7SAlex Zinenko en.value().replaceAllUsesWith(ivReplacement); 2544ead2cf7SAlex Zinenko std::advance(lbArgumentIt, 1); 2554ead2cf7SAlex Zinenko std::advance(stepArgumentIt, 1); 2564ead2cf7SAlex Zinenko } 2574ead2cf7SAlex Zinenko 2584ead2cf7SAlex Zinenko // We are done and can erase the original outermost loop. 2594ead2cf7SAlex Zinenko rootForOp.erase(); 2604ead2cf7SAlex Zinenko } 2614ead2cf7SAlex Zinenko 2624ead2cf7SAlex Zinenko // Generic loop to GPU kernel conversion function. 
2632bcd1927SMaheshRavishankar static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp, 2644ead2cf7SAlex Zinenko unsigned numBlockDims, 2654ead2cf7SAlex Zinenko unsigned numThreadDims) { 2662bcd1927SMaheshRavishankar if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims))) 2674ead2cf7SAlex Zinenko return failure(); 2684ead2cf7SAlex Zinenko 2692bcd1927SMaheshRavishankar AffineLoopToGpuConverter converter; 2704ead2cf7SAlex Zinenko auto maybeInnerLoop = 2714ead2cf7SAlex Zinenko converter.collectBounds(forOp, numBlockDims + numThreadDims); 2724ead2cf7SAlex Zinenko if (!maybeInnerLoop) 2734ead2cf7SAlex Zinenko return failure(); 2744ead2cf7SAlex Zinenko converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims); 2754ead2cf7SAlex Zinenko 2764ead2cf7SAlex Zinenko return success(); 2774ead2cf7SAlex Zinenko } 2784ead2cf7SAlex Zinenko 2794ead2cf7SAlex Zinenko LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp, 2804ead2cf7SAlex Zinenko unsigned numBlockDims, 2814ead2cf7SAlex Zinenko unsigned numThreadDims) { 2822bcd1927SMaheshRavishankar return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims); 2834ead2cf7SAlex Zinenko } 2844ead2cf7SAlex Zinenko 2854ead2cf7SAlex Zinenko namespace { 2864ead2cf7SAlex Zinenko struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> { 2874ead2cf7SAlex Zinenko using OpRewritePattern<ParallelOp>::OpRewritePattern; 2884ead2cf7SAlex Zinenko 2894ead2cf7SAlex Zinenko LogicalResult matchAndRewrite(ParallelOp parallelOp, 2904ead2cf7SAlex Zinenko PatternRewriter &rewriter) const override; 2914ead2cf7SAlex Zinenko }; 2924ead2cf7SAlex Zinenko } // namespace 2934ead2cf7SAlex Zinenko 2944ead2cf7SAlex Zinenko /// Tries to derive a static upper bound from the defining operation of 2954ead2cf7SAlex Zinenko /// `upperBound`. 
2964ead2cf7SAlex Zinenko static Value deriveStaticUpperBound(Value upperBound, 2974ead2cf7SAlex Zinenko PatternRewriter &rewriter) { 2984ead2cf7SAlex Zinenko if (auto op = upperBound.getDefiningOp<ConstantIndexOp>()) { 2994ead2cf7SAlex Zinenko return op; 3004ead2cf7SAlex Zinenko } 3014ead2cf7SAlex Zinenko 3024ead2cf7SAlex Zinenko if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) { 3034ead2cf7SAlex Zinenko for (const AffineExpr &result : minOp.map().getResults()) { 3044ead2cf7SAlex Zinenko if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) { 3054ead2cf7SAlex Zinenko return rewriter.create<ConstantIndexOp>(minOp.getLoc(), 3064ead2cf7SAlex Zinenko constExpr.getValue()); 3074ead2cf7SAlex Zinenko } 3084ead2cf7SAlex Zinenko } 3094ead2cf7SAlex Zinenko } 3104ead2cf7SAlex Zinenko 3114ead2cf7SAlex Zinenko if (auto multiplyOp = upperBound.getDefiningOp<MulIOp>()) { 3124ead2cf7SAlex Zinenko if (auto lhs = dyn_cast_or_null<ConstantIndexOp>( 3134ead2cf7SAlex Zinenko deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter) 3144ead2cf7SAlex Zinenko .getDefiningOp())) 3154ead2cf7SAlex Zinenko if (auto rhs = dyn_cast_or_null<ConstantIndexOp>( 3164ead2cf7SAlex Zinenko deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter) 3174ead2cf7SAlex Zinenko .getDefiningOp())) { 3184ead2cf7SAlex Zinenko // Assumptions about the upper bound of minimum computations no longer 3194ead2cf7SAlex Zinenko // work if multiplied by a negative value, so abort in this case. 
3204ead2cf7SAlex Zinenko if (lhs.getValue() < 0 || rhs.getValue() < 0) 3214ead2cf7SAlex Zinenko return {}; 3224ead2cf7SAlex Zinenko 3234ead2cf7SAlex Zinenko return rewriter.create<ConstantIndexOp>( 3244ead2cf7SAlex Zinenko multiplyOp.getLoc(), lhs.getValue() * rhs.getValue()); 3254ead2cf7SAlex Zinenko } 3264ead2cf7SAlex Zinenko } 3274ead2cf7SAlex Zinenko 3284ead2cf7SAlex Zinenko return {}; 3294ead2cf7SAlex Zinenko } 3304ead2cf7SAlex Zinenko 3314ead2cf7SAlex Zinenko static bool isMappedToProcessor(gpu::Processor processor) { 3324ead2cf7SAlex Zinenko return processor != gpu::Processor::Sequential; 3334ead2cf7SAlex Zinenko } 3344ead2cf7SAlex Zinenko 3354ead2cf7SAlex Zinenko static unsigned getLaunchOpArgumentNum(gpu::Processor processor) { 3364ead2cf7SAlex Zinenko switch (processor) { 3374ead2cf7SAlex Zinenko case gpu::Processor::BlockX: 3384ead2cf7SAlex Zinenko return 0; 3394ead2cf7SAlex Zinenko case gpu::Processor::BlockY: 3404ead2cf7SAlex Zinenko return 1; 3414ead2cf7SAlex Zinenko case gpu::Processor::BlockZ: 3424ead2cf7SAlex Zinenko return 2; 3434ead2cf7SAlex Zinenko case gpu::Processor::ThreadX: 3444ead2cf7SAlex Zinenko return 3; 3454ead2cf7SAlex Zinenko case gpu::Processor::ThreadY: 3464ead2cf7SAlex Zinenko return 4; 3474ead2cf7SAlex Zinenko case gpu::Processor::ThreadZ: 3484ead2cf7SAlex Zinenko return 5; 3494ead2cf7SAlex Zinenko default:; 3504ead2cf7SAlex Zinenko } 3514ead2cf7SAlex Zinenko llvm_unreachable( 3524ead2cf7SAlex Zinenko "invalid processor type while retrieving launch op argument number"); 3534ead2cf7SAlex Zinenko } 3544ead2cf7SAlex Zinenko 3554ead2cf7SAlex Zinenko /// Modifies the current transformation state to capture the effect of the given 3564ead2cf7SAlex Zinenko /// `scf.parallel` operation on index substitutions and the operations to be 3574ead2cf7SAlex Zinenko /// inserted. 
3584ead2cf7SAlex Zinenko /// Specifically, if a dimension of a parallel loop is mapped to a hardware id, 3594ead2cf7SAlex Zinenko /// this function will 3604ead2cf7SAlex Zinenko /// - compute the loop index based on the hardware id and affine map from the 3614ead2cf7SAlex Zinenko /// mapping and update `cloningMap` to substitute all uses. 3624ead2cf7SAlex Zinenko /// - derive a new upper bound for the hardware id and augment the provided 3634ead2cf7SAlex Zinenko /// `gpu.launch operation` accordingly. 3644ead2cf7SAlex Zinenko /// - if the upper bound is imprecise, insert a conditional in the `gpu.launch` 3654ead2cf7SAlex Zinenko /// and update the rewriter to insert into the conditional's body. 3664ead2cf7SAlex Zinenko /// If the dimension is mapped to sequential, 3674ead2cf7SAlex Zinenko /// - insert a for loop into the body and update the rewriter to insert into 3684ead2cf7SAlex Zinenko /// the for loop's body. 3694ead2cf7SAlex Zinenko /// - update the `cloningMap` to replace uses of the index with the index of 3704ead2cf7SAlex Zinenko /// the new for loop. 3714ead2cf7SAlex Zinenko /// In either case, 3724ead2cf7SAlex Zinenko /// - append the instructions from the loops body to worklist, in reverse order. 3734ead2cf7SAlex Zinenko /// To note the end of the current scope in case a loop or conditional was 3744ead2cf7SAlex Zinenko /// inserted, a sentinel (the `gpu.launch` operation) is inserted into the 3754ead2cf7SAlex Zinenko /// worklist. This signals the processor of the worklist to pop the rewriter 3764ead2cf7SAlex Zinenko /// one scope-level up. 3774ead2cf7SAlex Zinenko static LogicalResult processParallelLoop( 3784ead2cf7SAlex Zinenko ParallelOp parallelOp, gpu::LaunchOp launchOp, 3794ead2cf7SAlex Zinenko BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist, 3804ead2cf7SAlex Zinenko DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) { 3819db53a18SRiver Riddle // TODO: Verify that this is a valid GPU mapping. 
3824ead2cf7SAlex Zinenko // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential 3834ead2cf7SAlex Zinenko ArrayAttr mapping = 3840bf4a82aSChristian Sigg parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName()); 3854ead2cf7SAlex Zinenko 3869db53a18SRiver Riddle // TODO: Support reductions. 3874ead2cf7SAlex Zinenko if (!mapping || parallelOp.getNumResults() != 0) 3884ead2cf7SAlex Zinenko return failure(); 3894ead2cf7SAlex Zinenko 3904ead2cf7SAlex Zinenko Location loc = parallelOp.getLoc(); 3914ead2cf7SAlex Zinenko 3924ead2cf7SAlex Zinenko auto launchIndependent = [&launchOp](Value val) { 3930bf4a82aSChristian Sigg return val.getParentRegion()->isAncestor(launchOp->getParentRegion()); 3944ead2cf7SAlex Zinenko }; 3954ead2cf7SAlex Zinenko 3964ead2cf7SAlex Zinenko auto ensureLaunchIndependent = [&rewriter, 3974ead2cf7SAlex Zinenko launchIndependent](Value val) -> Value { 3984ead2cf7SAlex Zinenko if (launchIndependent(val)) 3994ead2cf7SAlex Zinenko return val; 4004ead2cf7SAlex Zinenko if (ConstantOp constOp = val.getDefiningOp<ConstantOp>()) 4014ead2cf7SAlex Zinenko return rewriter.create<ConstantOp>(constOp.getLoc(), constOp.getValue()); 4024ead2cf7SAlex Zinenko return {}; 4034ead2cf7SAlex Zinenko }; 4044ead2cf7SAlex Zinenko 4054ead2cf7SAlex Zinenko for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(), 4064ead2cf7SAlex Zinenko parallelOp.lowerBound(), parallelOp.upperBound(), 4074ead2cf7SAlex Zinenko parallelOp.step())) { 4084ead2cf7SAlex Zinenko Attribute mappingAttribute; 4094ead2cf7SAlex Zinenko Value iv, lowerBound, upperBound, step; 4104ead2cf7SAlex Zinenko std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config; 4114ead2cf7SAlex Zinenko auto annotation = mappingAttribute.dyn_cast<gpu::ParallelLoopDimMapping>(); 4124ead2cf7SAlex Zinenko if (!annotation) 4134ead2cf7SAlex Zinenko return parallelOp.emitOpError() 4144ead2cf7SAlex Zinenko << "expected mapping attribute for lowering to GPU"; 4154ead2cf7SAlex Zinenko 
Value newIndex; 4164ead2cf7SAlex Zinenko gpu::Processor processor = gpu::getProcessor(annotation); 4174ead2cf7SAlex Zinenko 4184ead2cf7SAlex Zinenko if (isMappedToProcessor(processor)) { 4194ead2cf7SAlex Zinenko // Use the corresponding thread/grid index as replacement for the loop iv. 420e2b71610SRahul Joshi Value operand = 421e2b71610SRahul Joshi launchOp.body().getArgument(getLaunchOpArgumentNum(processor)); 4224ead2cf7SAlex Zinenko // Take the indexmap and add the lower bound and step computations in. 4234ead2cf7SAlex Zinenko // This computes operand * step + lowerBound. 4244ead2cf7SAlex Zinenko // Use an affine map here so that it composes nicely with the provided 4254ead2cf7SAlex Zinenko // annotation. 4264ead2cf7SAlex Zinenko AffineMap lowerAndStep = AffineMap::get( 4274ead2cf7SAlex Zinenko 1, 2, 4284ead2cf7SAlex Zinenko rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) + 4294ead2cf7SAlex Zinenko rewriter.getAffineSymbolExpr(1)); 4304ead2cf7SAlex Zinenko newIndex = rewriter.create<AffineApplyOp>( 4314ead2cf7SAlex Zinenko loc, annotation.map().getValue().compose(lowerAndStep), 4324ead2cf7SAlex Zinenko ValueRange{operand, step, lowerBound}); 4334ead2cf7SAlex Zinenko // If there was also a bound, insert that, too. 4349db53a18SRiver Riddle // TODO: Check that we do not assign bounds twice. 4354ead2cf7SAlex Zinenko if (annotation.bound().getValue()) { 4364ead2cf7SAlex Zinenko // We pass as the single operand to the bound-map the number of 4374ead2cf7SAlex Zinenko // iterations, which is (upperBound - lowerBound) ceilDiv step. To 4384ead2cf7SAlex Zinenko // support inner loops with dynamic upper bounds (as generated by e.g. 4394ead2cf7SAlex Zinenko // tiling), try to derive a max for the bounds. If the used bound for 4404ead2cf7SAlex Zinenko // the hardware id is imprecise, wrap the contained code into a 4414ead2cf7SAlex Zinenko // conditional. 
If the lower-bound is constant or defined before the 4424ead2cf7SAlex Zinenko // launch, we can use it in the launch bounds. Otherwise fail. 4434ead2cf7SAlex Zinenko if (!launchIndependent(lowerBound) && 4444ead2cf7SAlex Zinenko !isa_and_nonnull<ConstantOp>(lowerBound.getDefiningOp())) 4454ead2cf7SAlex Zinenko return failure(); 4464ead2cf7SAlex Zinenko // The step must also be constant or defined outside of the loop nest. 4474ead2cf7SAlex Zinenko if (!launchIndependent(step) && 4484ead2cf7SAlex Zinenko !isa_and_nonnull<ConstantOp>(step.getDefiningOp())) 4494ead2cf7SAlex Zinenko return failure(); 4504ead2cf7SAlex Zinenko // If the upper-bound is constant or defined before the launch, we can 4514ead2cf7SAlex Zinenko // use it in the launch bounds directly. Otherwise try derive a bound. 4524ead2cf7SAlex Zinenko bool boundIsPrecise = 4534ead2cf7SAlex Zinenko launchIndependent(upperBound) || 4544ead2cf7SAlex Zinenko isa_and_nonnull<ConstantOp>(upperBound.getDefiningOp()); 4554ead2cf7SAlex Zinenko { 4564ead2cf7SAlex Zinenko PatternRewriter::InsertionGuard guard(rewriter); 4574ead2cf7SAlex Zinenko rewriter.setInsertionPoint(launchOp); 4584ead2cf7SAlex Zinenko if (!boundIsPrecise) { 4594ead2cf7SAlex Zinenko upperBound = deriveStaticUpperBound(upperBound, rewriter); 4604ead2cf7SAlex Zinenko if (!upperBound) { 4615da2423bSStephan Herhut return rewriter.notifyMatchFailure( 4625da2423bSStephan Herhut parallelOp, 4635da2423bSStephan Herhut "cannot derive loop-invariant upper bound for number of" 4645da2423bSStephan Herhut "iterations"); 4654ead2cf7SAlex Zinenko } 4664ead2cf7SAlex Zinenko } 4674ead2cf7SAlex Zinenko // Compute the number of iterations needed. We compute this as an 4684ead2cf7SAlex Zinenko // affine expression ceilDiv (upperBound - lowerBound) step. We use 4694ead2cf7SAlex Zinenko // affine.apply here so that it composes nicely with the provided map. 
47072d5ac90STres Popp AffineMap stepMap = AffineMap::get( 47172d5ac90STres Popp 1, 2, 47272d5ac90STres Popp ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0)) 47372d5ac90STres Popp .ceilDiv(rewriter.getAffineSymbolExpr(1)))); 4744ead2cf7SAlex Zinenko Value launchBound = rewriter.create<AffineApplyOp>( 4754ead2cf7SAlex Zinenko loc, annotation.bound().getValue().compose(stepMap), 4764ead2cf7SAlex Zinenko ValueRange{ 4774ead2cf7SAlex Zinenko ensureLaunchIndependent( 4784ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(upperBound)), 4794ead2cf7SAlex Zinenko ensureLaunchIndependent( 4804ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(lowerBound)), 4814ead2cf7SAlex Zinenko ensureLaunchIndependent(cloningMap.lookupOrDefault(step))}); 4824ead2cf7SAlex Zinenko // todo(herhut,ravishankarm): Update the behavior of setMappingAttr 4834ead2cf7SAlex Zinenko // when this condition is relaxed. 4844ead2cf7SAlex Zinenko if (bounds.find(processor) != bounds.end()) { 4855da2423bSStephan Herhut return rewriter.notifyMatchFailure( 4865da2423bSStephan Herhut parallelOp, "cannot redefine the bound for processor " + 4875da2423bSStephan Herhut Twine(static_cast<int64_t>(processor))); 4884ead2cf7SAlex Zinenko } 4894ead2cf7SAlex Zinenko bounds[processor] = launchBound; 4904ead2cf7SAlex Zinenko } 4914ead2cf7SAlex Zinenko if (!boundIsPrecise) { 4924ead2cf7SAlex Zinenko // We are using an approximation, create a surrounding conditional. 
4934ead2cf7SAlex Zinenko Value originalBound = std::get<3>(config); 4944ead2cf7SAlex Zinenko CmpIOp pred = rewriter.create<CmpIOp>( 4954ead2cf7SAlex Zinenko loc, CmpIPredicate::slt, newIndex, 4964ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(originalBound)); 4974ead2cf7SAlex Zinenko scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false); 4984ead2cf7SAlex Zinenko rewriter.setInsertionPointToStart(&ifOp.thenRegion().front()); 4994ead2cf7SAlex Zinenko // Put a sentinel into the worklist so we know when to pop out of the 5004ead2cf7SAlex Zinenko // if body again. We use the launchOp here, as that cannot be part of 5014ead2cf7SAlex Zinenko // the bodies instruction. 5024ead2cf7SAlex Zinenko worklist.push_back(launchOp.getOperation()); 5034ead2cf7SAlex Zinenko } 5044ead2cf7SAlex Zinenko } 5054ead2cf7SAlex Zinenko } else { 5064ead2cf7SAlex Zinenko // Create a sequential for loop. 5074ead2cf7SAlex Zinenko auto loopOp = rewriter.create<scf::ForOp>( 5084ead2cf7SAlex Zinenko loc, cloningMap.lookupOrDefault(lowerBound), 5094ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(upperBound), 5104ead2cf7SAlex Zinenko cloningMap.lookupOrDefault(step)); 5114ead2cf7SAlex Zinenko newIndex = loopOp.getInductionVar(); 5124ead2cf7SAlex Zinenko rewriter.setInsertionPointToStart(loopOp.getBody()); 5134ead2cf7SAlex Zinenko // Put a sentinel into the worklist so we know when to pop out of the loop 5144ead2cf7SAlex Zinenko // body again. We use the launchOp here, as that cannot be part of the 5154ead2cf7SAlex Zinenko // bodies instruction. 
5164ead2cf7SAlex Zinenko worklist.push_back(launchOp.getOperation()); 5174ead2cf7SAlex Zinenko } 5184ead2cf7SAlex Zinenko cloningMap.map(iv, newIndex); 5194ead2cf7SAlex Zinenko } 520396e7f45SArtur Bialas 521396e7f45SArtur Bialas // Propagate custom user defined optional attributes, that can be used at 522396e7f45SArtur Bialas // later stage, such as extension data for GPU kernel dispatch 523396e7f45SArtur Bialas for (const auto &namedAttr : parallelOp.getAttrs()) { 524396e7f45SArtur Bialas if (namedAttr.first == gpu::getMappingAttrName() || 525396e7f45SArtur Bialas namedAttr.first == ParallelOp::getOperandSegmentSizeAttr()) 526396e7f45SArtur Bialas continue; 527*1ffc1aaaSChristian Sigg launchOp->setAttr(namedAttr.first, namedAttr.second); 528396e7f45SArtur Bialas } 529396e7f45SArtur Bialas 5304ead2cf7SAlex Zinenko Block *body = parallelOp.getBody(); 5314ead2cf7SAlex Zinenko worklist.reserve(worklist.size() + body->getOperations().size()); 5324ead2cf7SAlex Zinenko for (Operation &op : llvm::reverse(body->without_terminator())) 5334ead2cf7SAlex Zinenko worklist.push_back(&op); 5344ead2cf7SAlex Zinenko return success(); 5354ead2cf7SAlex Zinenko } 5364ead2cf7SAlex Zinenko 5374ead2cf7SAlex Zinenko /// Lower a `scf.parallel` operation into a corresponding `gpu.launch` 5384ead2cf7SAlex Zinenko /// operation. 5394ead2cf7SAlex Zinenko /// 5404ead2cf7SAlex Zinenko /// This essentially transforms a loop nest into a corresponding SIMT function. 5414ead2cf7SAlex Zinenko /// The conversion is driven by mapping annotations on the `scf.parallel` 5424ead2cf7SAlex Zinenko /// operations. The mapping is provided via a `DictionaryAttribute` named 5434ead2cf7SAlex Zinenko /// `mapping`, which has three entries: 5444ead2cf7SAlex Zinenko /// - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are 5454ead2cf7SAlex Zinenko /// thread dimensions and 6 is sequential. 
5464ead2cf7SAlex Zinenko /// - map : An affine map that is used to pre-process hardware ids before 5474ead2cf7SAlex Zinenko /// substitution. 5484ead2cf7SAlex Zinenko /// - bound : An affine map that is used to compute the bound of the hardware 5494ead2cf7SAlex Zinenko /// id based on an upper bound of the number of iterations. 5504ead2cf7SAlex Zinenko /// If the `scf.parallel` contains nested `scf.parallel` operations, those 5514ead2cf7SAlex Zinenko /// need to be annotated, as well. Structurally, the transformation works by 5524ead2cf7SAlex Zinenko /// splicing all operations from nested `scf.parallel` operations into a single 5534ead2cf7SAlex Zinenko /// sequence. Indices mapped to hardware ids are substituted with those ids, 5544ead2cf7SAlex Zinenko /// whereas sequential mappings result in a sequential for-loop. To have more 5554ead2cf7SAlex Zinenko /// flexibility when mapping code to hardware ids, the transform supports two 5564ead2cf7SAlex Zinenko /// affine maps. The first `map` is used to compute the actual index for 5574ead2cf7SAlex Zinenko /// substitution from the hardware id. The second `bound` is used to compute the 5584ead2cf7SAlex Zinenko /// launch dimension for the hardware id from the number of iterations the 5594ead2cf7SAlex Zinenko /// mapped loop is performing. Note that the number of iterations might be 5604ead2cf7SAlex Zinenko /// imprecise if the corresponding loop-bounds are loop-dependent. In such a case, 5614ead2cf7SAlex Zinenko /// the hardware id might iterate over additional indices. The transformation 5624ead2cf7SAlex Zinenko /// caters for this by predicating the created sequence of instructions on 5634ead2cf7SAlex Zinenko /// the actual loop bound. This only works if a static upper bound for the 5644ead2cf7SAlex Zinenko /// dynamic loop bound can be derived, currently via analyzing `affine.min` 5654ead2cf7SAlex Zinenko /// operations.
5664ead2cf7SAlex Zinenko LogicalResult 5674ead2cf7SAlex Zinenko ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, 5684ead2cf7SAlex Zinenko PatternRewriter &rewriter) const { 5695da2423bSStephan Herhut // We can only transform starting at the outer-most loop. Launches inside of 5705da2423bSStephan Herhut // parallel loops are not supported. 5710bf4a82aSChristian Sigg if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>()) 5725da2423bSStephan Herhut return failure(); 5734ead2cf7SAlex Zinenko // Create a launch operation. We start with bound one for all grid/block 5744ead2cf7SAlex Zinenko // sizes. Those will be refined later as we discover them from mappings. 5754ead2cf7SAlex Zinenko Location loc = parallelOp.getLoc(); 5764ead2cf7SAlex Zinenko Value constantOne = rewriter.create<ConstantIndexOp>(parallelOp.getLoc(), 1); 5774ead2cf7SAlex Zinenko gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>( 5784ead2cf7SAlex Zinenko parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne, 5794ead2cf7SAlex Zinenko constantOne, constantOne); 5804ead2cf7SAlex Zinenko rewriter.setInsertionPointToEnd(&launchOp.body().front()); 5814ead2cf7SAlex Zinenko rewriter.create<gpu::TerminatorOp>(loc); 5824ead2cf7SAlex Zinenko rewriter.setInsertionPointToStart(&launchOp.body().front()); 5834ead2cf7SAlex Zinenko 5844ead2cf7SAlex Zinenko BlockAndValueMapping cloningMap; 5854ead2cf7SAlex Zinenko llvm::DenseMap<gpu::Processor, Value> launchBounds; 5864ead2cf7SAlex Zinenko SmallVector<Operation *, 16> worklist; 5874ead2cf7SAlex Zinenko if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist, 5884ead2cf7SAlex Zinenko launchBounds, rewriter))) 5894ead2cf7SAlex Zinenko return failure(); 5904ead2cf7SAlex Zinenko 5914ead2cf7SAlex Zinenko // Whether we have seen any side-effects. Reset when leaving an inner scope. 
5924ead2cf7SAlex Zinenko bool seenSideeffects = false; 5934ead2cf7SAlex Zinenko // Whether we have left a nesting scope (and hence are no longer innermost). 5944ead2cf7SAlex Zinenko bool leftNestingScope = false; 5954ead2cf7SAlex Zinenko while (!worklist.empty()) { 5964ead2cf7SAlex Zinenko Operation *op = worklist.pop_back_val(); 5974ead2cf7SAlex Zinenko // Now walk over the body and clone it. 5984ead2cf7SAlex Zinenko // TODO: This is only correct if there either is no further scf.parallel 5994ead2cf7SAlex Zinenko // nested or this code is side-effect free. Otherwise we might need 6004ead2cf7SAlex Zinenko // predication. We are overly conservative for now and only allow 6014ead2cf7SAlex Zinenko // side-effects in the innermost scope. 6024ead2cf7SAlex Zinenko if (auto nestedParallel = dyn_cast<ParallelOp>(op)) { 6034ead2cf7SAlex Zinenko // Before entering a nested scope, make sure there have been no 6044ead2cf7SAlex Zinenko // sideeffects until now. 6054ead2cf7SAlex Zinenko if (seenSideeffects) 6064ead2cf7SAlex Zinenko return failure(); 6074ead2cf7SAlex Zinenko // A nested scf.parallel needs insertion of code to compute indices. 6084ead2cf7SAlex Zinenko // Insert that now. This will also update the worklist with the loops 6094ead2cf7SAlex Zinenko // body. 6104ead2cf7SAlex Zinenko if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap, 6114ead2cf7SAlex Zinenko worklist, launchBounds, rewriter))) 6124ead2cf7SAlex Zinenko return failure(); 6134ead2cf7SAlex Zinenko } else if (op == launchOp.getOperation()) { 6144ead2cf7SAlex Zinenko // Found our sentinel value. We have finished the operations from one 6154ead2cf7SAlex Zinenko // nesting level, pop one level back up. 
6164ead2cf7SAlex Zinenko auto parent = rewriter.getInsertionPoint()->getParentOp(); 6174ead2cf7SAlex Zinenko rewriter.setInsertionPointAfter(parent); 6184ead2cf7SAlex Zinenko leftNestingScope = true; 6194ead2cf7SAlex Zinenko seenSideeffects = false; 6204ead2cf7SAlex Zinenko } else { 6214ead2cf7SAlex Zinenko // Otherwise we copy it over. 6224ead2cf7SAlex Zinenko Operation *clone = rewriter.clone(*op, cloningMap); 6234ead2cf7SAlex Zinenko cloningMap.map(op->getResults(), clone->getResults()); 6244ead2cf7SAlex Zinenko // Check for side effects. 6254ead2cf7SAlex Zinenko // TODO: Handle region side effects properly. 6264ead2cf7SAlex Zinenko seenSideeffects |= !MemoryEffectOpInterface::hasNoEffect(clone) || 6274ead2cf7SAlex Zinenko clone->getNumRegions() != 0; 6284ead2cf7SAlex Zinenko // If we are no longer in the innermost scope, sideeffects are disallowed. 6294ead2cf7SAlex Zinenko if (seenSideeffects && leftNestingScope) 6304ead2cf7SAlex Zinenko return failure(); 6314ead2cf7SAlex Zinenko } 6324ead2cf7SAlex Zinenko } 6334ead2cf7SAlex Zinenko 6344ead2cf7SAlex Zinenko // Now that we succeeded creating the launch operation, also update the 6354ead2cf7SAlex Zinenko // bounds. 
6364ead2cf7SAlex Zinenko for (auto bound : launchBounds) 6374ead2cf7SAlex Zinenko launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)), 6384ead2cf7SAlex Zinenko std::get<1>(bound)); 6394ead2cf7SAlex Zinenko 6404ead2cf7SAlex Zinenko rewriter.eraseOp(parallelOp); 6414ead2cf7SAlex Zinenko return success(); 6424ead2cf7SAlex Zinenko } 6434ead2cf7SAlex Zinenko 6444ead2cf7SAlex Zinenko void mlir::populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns, 6454ead2cf7SAlex Zinenko MLIRContext *ctx) { 6464ead2cf7SAlex Zinenko patterns.insert<ParallelToGpuLaunchLowering>(ctx); 6474ead2cf7SAlex Zinenko } 6485da2423bSStephan Herhut 6495da2423bSStephan Herhut void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) { 6505da2423bSStephan Herhut target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) { 651*1ffc1aaaSChristian Sigg return !parallelOp->getAttr(gpu::getMappingAttrName()); 6525da2423bSStephan Herhut }); 6535da2423bSStephan Herhut } 654