//===- KernelOutlining.cpp - Implementation of GPU kernel outlining ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU dialect kernel outlining pass.
//
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/GPU/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"

using namespace mlir;

template <typename OpTy>
static void createForAllDimensions(OpBuilder &builder, Location loc,
                                   SmallVectorImpl<Value> &values) {
  for (StringRef dim : {"x", "y", "z"}) {
    Value v = builder.create<OpTy>(loc, builder.getIndexType(),
                                   builder.getStringAttr(dim));
    values.push_back(v);
  }
}

/// Adds operations generating block/thread ids and grid/block dimensions at
/// the beginning of the `launchFuncOpBody` region. Adds a mapping from each
/// argument of the entry block of `launchOpBody` to the corresponding result
/// of the added operations.
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
                                     Region &launchOpBody,
                                     BlockAndValueMapping &map) {
  OpBuilder builder(loc->getContext());
  Block &firstBlock = launchOpBody.front();
  builder.setInsertionPointToStart(&launchFuncOpBody.front());
  SmallVector<Value, 12> indexOps;
  createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
  // Map the leading 12 block arguments of the launch region to the
  // thread/block index operations created above; the mapping is consumed when
  // the region is cloned into the outlined function.
  for (auto indexOp : enumerate(indexOps))
    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
}

/// Identifies operations that are beneficial to sink into kernels. These
/// operations must not have side effects, as otherwise sinking (and hence
/// duplicating) them is not legal.
static bool isSinkingBeneficiary(Operation *op) {
  return isa<ConstantOp, memref::DimOp, SelectOp, CmpIOp>(op);
}

/// For a given operation `op`, computes whether it is beneficial to sink the
/// operation into the kernel. An operation can be sunk if doing so does not
/// introduce new kernel arguments. Whether a value is already available in the
/// kernel (and hence does not introduce new arguments) is checked by querying
/// `existingDependencies` and `availableValues`. If an operand is not yet
/// available, we recursively check whether it can be made available by sinking
/// its defining op. Operations that are identified for sinking are added to
/// `beneficiaryOps` in the order they should appear in the kernel.
/// Furthermore, `availableValues` is updated with results that will be
/// available after sinking the identified ops.
static bool
extractBeneficiaryOps(Operation *op,
                      llvm::SetVector<Value> existingDependencies,
                      llvm::SetVector<Operation *> &beneficiaryOps,
                      llvm::SmallPtrSetImpl<Value> &availableValues) {
  if (beneficiaryOps.count(op))
    return true;

  if (!isSinkingBeneficiary(op))
    return false;

  for (Value operand : op->getOperands()) {
    // It is already visible in the kernel, keep going.
    if (availableValues.count(operand))
      continue;
    // Else check whether it can be made available via sinking or already is a
    // dependency.
    Operation *definingOp = operand.getDefiningOp();
    if ((!definingOp ||
         !extractBeneficiaryOps(definingOp, existingDependencies,
                                beneficiaryOps, availableValues)) &&
        !existingDependencies.count(operand))
      return false;
  }
  // We will sink the operation, mark its results as now available.
  beneficiaryOps.insert(op);
  for (Value result : op->getResults())
    availableValues.insert(result);
  return true;
}

LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
  Region &launchOpBody = launchOp.body();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  llvm::SetVector<Value> sinkCandidates;
  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);

  llvm::SetVector<Operation *> toBeSunk;
  llvm::SmallPtrSet<Value, 4> availableValues;
  for (Value operand : sinkCandidates) {
    Operation *operandOp = operand.getDefiningOp();
    if (!operandOp)
      continue;
    extractBeneficiaryOps(operandOp, sinkCandidates, toBeSunk, availableValues);
  }

  // Insert operations so that the defs get cloned before uses.
  BlockAndValueMapping map;
  OpBuilder builder(launchOpBody);
  for (Operation *op : toBeSunk) {
    Operation *clonedOp = builder.clone(*op, map);
    // Only replace uses within the launch op.
    for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
      replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
                                 launchOp.body());
  }
  return success();
}

/// Outline the `gpu.launch` operation body into a kernel function. Replace
/// `gpu.terminator` operations by `gpu.return` in the generated function.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
                                            StringRef kernelFnName,
                                            llvm::SetVector<Value> &operands) {
  Location loc = launchOp.getLoc();
  // Create a builder with no insertion point, insertion will happen separately
  // due to symbol table manipulation.
  OpBuilder builder(launchOp.getContext());
  Region &launchOpBody = launchOp.body();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  getUsedValuesDefinedAbove(launchOpBody, operands);

  // Create the gpu.func operation.
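  // Its signature has one argument per value captured from the surrounding
  // region. As an illustrative example (hypothetical values, not taken from
  // this file): a launch body that reads %buf : memref<?xf32> and %n : index
  // from above is outlined into a kernel of function type
  // (memref<?xf32>, index) -> ().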
  SmallVector<Type, 4> kernelOperandTypes;
  kernelOperandTypes.reserve(operands.size());
  for (Value operand : operands) {
    kernelOperandTypes.push_back(operand.getType());
  }
  FunctionType type =
      FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFnName, type);
  outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                        builder.getUnitAttr());
  BlockAndValueMapping map;

  // Map the arguments corresponding to the launch parameters like blockIdx,
  // threadIdx, etc.
  Region &outlinedFuncBody = outlinedFunc.body();
  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);

  // Map arguments from gpu.launch region to the arguments of the gpu.func
  // operation.
  Block &entryBlock = outlinedFuncBody.front();
  for (auto operand : enumerate(operands))
    map.map(operand.value(), entryBlock.getArgument(operand.index()));

  // Clone the region of the gpu.launch operation into the gpu.func operation.
  // TODO: If cloneInto can be modified such that if a mapping for a block
  // exists, that block will be used to clone operations into (at the end of
  // the block), instead of creating a new block, this would be much cleaner.
  launchOpBody.cloneInto(&outlinedFuncBody, map);

  // Branch from the entry of the gpu.func operation to the block that is
  // cloned from the entry block of the gpu.launch operation.
  Block &launchOpEntry = launchOpBody.front();
  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
  builder.setInsertionPointToEnd(&entryBlock);
  builder.create<BranchOp>(loc, clonedLaunchOpEntry);

  // Rewrite the terminators of the cloned launch body to function returns.
  outlinedFunc.walk([](gpu::TerminatorOp op) {
    OpBuilder replacer(op);
    replacer.create<gpu::ReturnOp>(op.getLoc());
    op.erase();
  });
  return outlinedFunc;
}

gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
                                       StringRef kernelFnName,
                                       llvm::SmallVectorImpl<Value> &operands) {
  DenseSet<Value> inputOperandSet;
  inputOperandSet.insert(operands.begin(), operands.end());
  llvm::SetVector<Value> operandSet(operands.begin(), operands.end());
  auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet);
  // Append any operands discovered during outlining that were not already in
  // the input list.
  for (auto operand : operandSet) {
    if (!inputOperandSet.count(operand))
      operands.push_back(operand);
  }
  return funcOp;
}

/// Replace `gpu.launch` operations with a `gpu.launch_func` operation
/// launching `kernelFunc`. The kernel func contains the body of the
/// `gpu.launch` with constant region arguments inlined.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                  gpu::GPUFuncOp kernelFunc,
                                  ValueRange operands) {
  OpBuilder builder(launchOp);
  builder.create<gpu::LaunchFuncOp>(
      launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
      launchOp.getBlockSizeOperandValues(), operands);
  launchOp.erase();
}

namespace {
/// Pass that moves the kernel of each LaunchOp into its separate nested
/// module.
///
/// This pass moves the kernel code of each LaunchOp into a function created
/// inside a nested module. It also creates an external function of the same
/// name in the parent module.
///
/// The gpu.modules are intended to be compiled to a cubin blob independently
/// in a separate pass. The external functions can then be annotated with the
/// symbol of the cubin accessor function.
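///
/// As a rough sketch of the transformation (illustrative only; the names
/// @foo, @foo_kernel, and %buf are hypothetical):
///
///   // Before:
///   func @foo(%buf : memref<?xf32>) {
///     gpu.launch blocks(%bx, %by, %bz) in (...) threads(%tx, %ty, %tz)
///         in (...) {
///       ... ops using %buf, %bx, %tx ...
///       gpu.terminator
///     }
///   }
///
///   // After:
///   func @foo(%buf : memref<?xf32>) {
///     gpu.launch_func @foo_kernel::@foo_kernel blocks in (...)
///         threads in (...) args(%buf : memref<?xf32>)
///   }
///   gpu.module @foo_kernel {
///     gpu.func @foo_kernel(%buf : memref<?xf32>) kernel {
///       %bx = "gpu.block_id"() {dimension = "x"} : () -> index
///       ...
///       gpu.return
///     }
///   }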
class GpuKernelOutliningPass
    : public GpuKernelOutliningBase<GpuKernelOutliningPass> {
public:
  void runOnOperation() override {
    SymbolTable symbolTable(getOperation());
    bool modified = false;
    for (auto func : getOperation().getOps<FuncOp>()) {
      // Insert just after the function.
      Block::iterator insertPt(func->getNextNode());
      auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
        llvm::SetVector<Value> operands;
        std::string kernelFnName =
            Twine(op->getParentOfType<FuncOp>().getName(), "_kernel").str();

        // Pull in instructions that can be sunk.
        if (failed(sinkOperationsIntoLaunchOp(op)))
          return WalkResult::interrupt();
        gpu::GPUFuncOp outlinedFunc =
            outlineKernelFuncImpl(op, kernelFnName, operands);

        // Create nested module and insert outlinedFunc. The module will
        // originally get the same name as the function, but may be renamed on
        // insertion into the parent module.
        auto kernelModule = createKernelModule(outlinedFunc, symbolTable);
        symbolTable.insert(kernelModule, insertPt);

        // Potentially changes signature, pulling in constants.
        convertToLaunchFuncOp(op, outlinedFunc, operands.getArrayRef());
        modified = true;
        return WalkResult::advance();
      });
      if (funcWalkResult.wasInterrupted())
        return signalPassFailure();
    }

    // If any new module was inserted in this module, annotate this module as
    // a container module.
    if (modified)
      getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
                              UnitAttr::get(&getContext()));
  }

private:
  /// Returns a gpu.module containing kernelFunc and all callees (recursive).
  gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                                      const SymbolTable &parentSymbolTable) {
    // TODO: This code cannot use an OpBuilder because it must be inserted into
    // a SymbolTable by the caller. SymbolTable needs to be refactored to
    // prevent manual building of Ops with symbols in code using SymbolTables
    // and then this needs to use the OpBuilder.
    auto context = getOperation().getContext();
    OpBuilder builder(context);
    auto kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
                                                         kernelFunc.getName());
    SymbolTable symbolTable(kernelModule);
    symbolTable.insert(kernelFunc);

    SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc};
    while (!symbolDefWorklist.empty()) {
      if (Optional<SymbolTable::UseRange> symbolUses =
              SymbolTable::getSymbolUses(symbolDefWorklist.pop_back_val())) {
        for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
          StringRef symbolName =
              symbolUse.getSymbolRef().cast<FlatSymbolRefAttr>().getValue();
          if (symbolTable.lookup(symbolName))
            continue;

          Operation *symbolDefClone =
              parentSymbolTable.lookup(symbolName)->clone();
          symbolDefWorklist.push_back(symbolDefClone);
          symbolTable.insert(symbolDefClone);
        }
      }
    }

    return kernelModule;
  }
};

} // namespace

std::unique_ptr<OperationPass<ModuleOp>> mlir::createGpuKernelOutliningPass() {
  return std::make_unique<GpuKernelOutliningPass>();
}
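
// Usage sketch (not part of this file): the pass operates on the top-level
// module, so it would typically be scheduled on a module-level pass manager,
// e.g.
//
//   PassManager pm(&context);
//   pm.addPass(createGpuKernelOutliningPass());
//   if (failed(pm.run(module)))
//     ...;
//
// or invoked from the command line via `mlir-opt -gpu-kernel-outlining`.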