1 //===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the GPU dialect kernel outlining pass. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "PassDetail.h" 14 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" 15 #include "mlir/Dialect/DLTI/DLTI.h" 16 #include "mlir/Dialect/GPU/GPUDialect.h" 17 #include "mlir/Dialect/GPU/Passes.h" 18 #include "mlir/Dialect/GPU/Utils.h" 19 #include "mlir/Dialect/MemRef/IR/MemRef.h" 20 #include "mlir/Dialect/StandardOps/IR/Ops.h" 21 #include "mlir/IR/BlockAndValueMapping.h" 22 #include "mlir/IR/Builders.h" 23 #include "mlir/IR/SymbolTable.h" 24 #include "mlir/Parser.h" 25 #include "mlir/Support/LLVM.h" 26 #include "mlir/Transforms/RegionUtils.h" 27 28 using namespace mlir; 29 30 template <typename OpTy> 31 static void createForAllDimensions(OpBuilder &builder, Location loc, 32 SmallVectorImpl<Value> &values) { 33 for (StringRef dim : {"x", "y", "z"}) { 34 Value v = builder.create<OpTy>(loc, builder.getIndexType(), 35 builder.getStringAttr(dim)); 36 values.push_back(v); 37 } 38 } 39 40 /// Adds operations generating block/thread ids and grid/block dimensions at the 41 /// beginning of the `launchFuncOpBody` region. Add mapping from argument in 42 /// entry block of `launchOpBody`, to the corresponding result value of the 43 /// added operations. 
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
                                     Region &launchOpBody,
                                     BlockAndValueMapping &map) {
  OpBuilder builder(loc->getContext());
  Block &firstBlock = launchOpBody.front();
  // New index ops are created at the top of the outlined function's entry
  // block so they dominate every cloned use.
  builder.setInsertionPointToStart(&launchFuncOpBody.front());
  // 4 op kinds x 3 dimensions = 12 values, in the same order as the leading
  // block arguments of the gpu.launch region.
  SmallVector<Value, 12> indexOps;
  createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
  // Record a mapping from each of the leading 12 launch-region arguments to
  // the corresponding newly created index operation. Nothing is erased here;
  // the map is consumed later when the region is cloned.
  for (const auto &indexOp : enumerate(indexOps))
    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
}

/// Identifies operations that are beneficial to sink into kernels. These
/// operations may not have side-effects, as otherwise sinking (and hence
/// duplicating them) is not legal.
static bool isSinkingBeneficiary(Operation *op) {
  return isa<arith::ConstantOp, ConstantOp, memref::DimOp, SelectOp,
             arith::CmpIOp>(op);
}

/// For a given operation `op`, computes whether it is beneficial to sink the
/// operation into the kernel. An operation can be sunk if doing so does not
/// introduce new kernel arguments. Whether a value is already available in the
/// kernel (and hence does not introduce new arguments) is checked by
/// querying `existingDependencies` and `availableValues`.
/// If an operand is not yet available, we recursively check whether it can be
/// made available by sinking its defining op.
/// Operations that are identified for sinking are added to `beneficiaryOps` in
/// the order they should appear in the kernel. Furthermore, `availableValues`
/// is updated with results that will be available after sinking the identified
/// ops.
static bool
extractBeneficiaryOps(Operation *op,
                      const SetVector<Value> &existingDependencies,
                      SetVector<Operation *> &beneficiaryOps,
                      llvm::SmallPtrSetImpl<Value> &availableValues) {
  // Already scheduled for sinking in an earlier traversal.
  if (beneficiaryOps.count(op))
    return true;

  // Side-effecting or otherwise non-duplicable ops must not be sunk.
  if (!isSinkingBeneficiary(op))
    return false;

  for (Value operand : op->getOperands()) {
    // It is already visible in the kernel, keep going.
    if (availableValues.count(operand))
      continue;
    // Else check whether it can be made available via sinking or already is a
    // dependency.
    Operation *definingOp = operand.getDefiningOp();
    if ((!definingOp ||
         !extractBeneficiaryOps(definingOp, existingDependencies,
                                beneficiaryOps, availableValues)) &&
        !existingDependencies.count(operand))
      return false;
  }
  // We will sink the operation, mark its results as now available.
  // Note: defs are inserted before `op` by the recursion above, so
  // `beneficiaryOps` ends up in def-before-use order.
  beneficiaryOps.insert(op);
  for (Value result : op->getResults())
    availableValues.insert(result);
  return true;
}

/// Sinks cheap, side-effect-free operations (constants, dims, compares,
/// selects) that are used inside `launchOp` into its body so they do not
/// become kernel arguments after outlining. Always succeeds.
LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
  Region &launchOpBody = launchOp.body();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  SetVector<Value> sinkCandidates;
  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);

  SetVector<Operation *> toBeSunk;
  llvm::SmallPtrSet<Value, 4> availableValues;
  for (Value operand : sinkCandidates) {
    Operation *operandOp = operand.getDefiningOp();
    // Block arguments have no defining op and cannot be sunk.
    if (!operandOp)
      continue;
    extractBeneficiaryOps(operandOp, sinkCandidates, toBeSunk, availableValues);
  }

  // Insert operations so that the defs get cloned before uses.
  BlockAndValueMapping map;
  OpBuilder builder(launchOpBody);
  for (Operation *op : toBeSunk) {
    // `map` remaps operands of later clones to earlier clones' results.
    Operation *clonedOp = builder.clone(*op, map);
    // Only replace uses within the launch op; uses outside keep seeing the
    // original (still present) operation.
    for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
      replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
                                 launchOp.body());
  }
  return success();
}

/// Outline the `gpu.launch` operation body into a kernel function. Replace
/// `gpu.terminator` operations by `gpu.return` in the generated function.
/// On return, `operands` additionally contains the values defined above the
/// launch that became arguments of the new function.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
                                            StringRef kernelFnName,
                                            SetVector<Value> &operands) {
  Location loc = launchOp.getLoc();
  // Create a builder with no insertion point, insertion will happen separately
  // due to symbol table manipulation.
  OpBuilder builder(launchOp.getContext());
  Region &launchOpBody = launchOp.body();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  getUsedValuesDefinedAbove(launchOpBody, operands);

  // Create the gpu.func operation: one argument per outside value, no results.
  SmallVector<Type, 4> kernelOperandTypes;
  kernelOperandTypes.reserve(operands.size());
  for (Value operand : operands) {
    kernelOperandTypes.push_back(operand.getType());
  }
  FunctionType type =
      FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFnName, type);
  // Mark the function as a kernel so the GPU dialect verifier/launch lowering
  // recognize it.
  outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                        builder.getUnitAttr());
  BlockAndValueMapping map;

  // Map the arguments corresponding to the launch parameters like blockIdx,
  // threadIdx, etc.
  Region &outlinedFuncBody = outlinedFunc.body();
  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);

  // Map arguments from gpu.launch region to the arguments of the gpu.func
  // operation.
  Block &entryBlock = outlinedFuncBody.front();
  for (const auto &operand : enumerate(operands))
    map.map(operand.value(), entryBlock.getArgument(operand.index()));

  // Clone the region of the gpu.launch operation into the gpu.func operation.
  // TODO: If cloneInto can be modified such that if a mapping for
  // a block exists, that block will be used to clone operations into (at the
  // end of the block), instead of creating a new block, this would be much
  // cleaner.
  launchOpBody.cloneInto(&outlinedFuncBody, map);

  // Branch from entry of the gpu.func operation to the block that is cloned
  // from the entry block of the gpu.launch operation.
  Block &launchOpEntry = launchOpBody.front();
  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
  builder.setInsertionPointToEnd(&entryBlock);
  builder.create<BranchOp>(loc, clonedLaunchOpEntry);

  // gpu.launch bodies end in gpu.terminator; a gpu.func must end in
  // gpu.return instead.
  outlinedFunc.walk([](gpu::TerminatorOp op) {
    OpBuilder replacer(op);
    replacer.create<gpu::ReturnOp>(op.getLoc());
    op.erase();
  });
  return outlinedFunc;
}

/// Public wrapper around outlineKernelFuncImpl that reports, via `operands`,
/// which values (beyond those passed in) became kernel arguments. Values
/// already present in `operands` on entry are not appended again.
gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
                                       StringRef kernelFnName,
                                       llvm::SmallVectorImpl<Value> &operands) {
  DenseSet<Value> inputOperandSet;
  inputOperandSet.insert(operands.begin(), operands.end());
  SetVector<Value> operandSet(operands.begin(), operands.end());
  auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet);
  // Append only the operands discovered by the outlining itself.
  for (auto operand : operandSet) {
    if (!inputOperandSet.count(operand))
      operands.push_back(operand);
  }
  return funcOp;
}

/// Replace `gpu.launch` operations with an `gpu.launch_func` operation
/// launching `kernelFunc`. The kernel func contains the body of the
/// `gpu.launch` with constant region arguments inlined.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                  gpu::GPUFuncOp kernelFunc,
                                  ValueRange operands) {
  OpBuilder builder(launchOp);
  // The launch op has an optional dynamic shared memory size. If it doesn't
  // exist, we use zero.
  builder.create<gpu::LaunchFuncOp>(
      launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
      launchOp.getBlockSizeOperandValues(), launchOp.dynamicSharedMemorySize(),
      operands);
  // The body has already been outlined; the original launch op is now dead.
  launchOp.erase();
}

namespace {
/// Pass that moves the kernel of each LaunchOp into its separate nested module.
///
/// This pass moves the kernel code of each LaunchOp into a function created
/// inside a nested module. It also creates an external function of the same
/// name in the parent module.
///
/// The gpu.modules are intended to be compiled to a cubin blob independently in
/// a separate pass. The external functions can then be annotated with the
/// symbol of the cubin accessor function.
class GpuKernelOutliningPass
    : public GpuKernelOutliningBase<GpuKernelOutliningPass> {
public:
  /// `dlStr` seeds the data-layout option unless it was already set on the
  /// command line (the explicit pass option takes precedence).
  GpuKernelOutliningPass(StringRef dlStr) {
    if (!dlStr.empty() && !dataLayoutStr.hasValue())
      dataLayoutStr = dlStr.str();
  }

  // NOTE(review): the base class is default-constructed rather than copied
  // here; only the option string and the parsed spec are carried over —
  // confirm this matches the pass-cloning expectations of the base.
  GpuKernelOutliningPass(const GpuKernelOutliningPass &other)
      : dataLayoutSpec(other.dataLayoutSpec) {
    dataLayoutStr = other.dataLayoutStr;
  }

  /// Parses `dataLayoutStr` into `dataLayoutSpec` once, before any pass run.
  /// Fails if the string does not parse or is not a data layout spec.
  LogicalResult initialize(MLIRContext *context) override {
    // Initialize the data layout specification from the data layout string.
    if (!dataLayoutStr.empty()) {
      Attribute resultAttr = mlir::parseAttribute(dataLayoutStr, context);
      if (!resultAttr)
        return failure();

      dataLayoutSpec = resultAttr.dyn_cast<DataLayoutSpecInterface>();
      if (!dataLayoutSpec)
        return failure();
    }

    return success();
  }

  void runOnOperation() override {
    SymbolTable symbolTable(getOperation());
    bool modified = false;
    for (auto func : getOperation().getOps<FuncOp>()) {
      // Insert just after the function.
      Block::iterator insertPt(func->getNextNode());
      auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
        SetVector<Value> operands;
        // Kernel name is derived from the enclosing function's name.
        std::string kernelFnName =
            Twine(op->getParentOfType<FuncOp>().getName(), "_kernel").str();

        // Pull in instructions that can be sunk
        if (failed(sinkOperationsIntoLaunchOp(op)))
          return WalkResult::interrupt();
        gpu::GPUFuncOp outlinedFunc =
            outlineKernelFuncImpl(op, kernelFnName, operands);

        // Create nested module and insert outlinedFunc. The module will
        // originally get the same name as the function, but may be renamed on
        // insertion into the parent module.
        auto kernelModule = createKernelModule(outlinedFunc, symbolTable);
        symbolTable.insert(kernelModule, insertPt);

        // Potentially changes signature, pulling in constants.
        convertToLaunchFuncOp(op, outlinedFunc, operands.getArrayRef());
        modified = true;
        return WalkResult::advance();
      });
      if (funcWalkResult.wasInterrupted())
        return signalPassFailure();
    }

    // If any new module was inserted in this module, annotate this module as
    // a container module.
    if (modified)
      getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
                              UnitAttr::get(&getContext()));
  }

private:
  /// Returns a gpu.module containing kernelFunc and all callees (recursive).
  gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                                      const SymbolTable &parentSymbolTable) {
    // TODO: This code cannot use an OpBuilder because it must be inserted into
    // a SymbolTable by the caller. SymbolTable needs to be refactored to
    // prevent manual building of Ops with symbols in code using SymbolTables
    // and then this needs to use the OpBuilder.
    auto *context = getOperation().getContext();
    OpBuilder builder(context);
    auto kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
                                                         kernelFunc.getName());

    // If a valid data layout spec was provided, attach it to the kernel module.
    // Otherwise, the default data layout will be used.
    if (dataLayoutSpec)
      kernelModule->setAttr(DLTIDialect::kDataLayoutAttrName, dataLayoutSpec);

    SymbolTable symbolTable(kernelModule);
    symbolTable.insert(kernelFunc);

    // Transitively clone every symbol the kernel references (e.g. called
    // functions) from the parent module into the new gpu.module.
    SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc};
    while (!symbolDefWorklist.empty()) {
      if (Optional<SymbolTable::UseRange> symbolUses =
              SymbolTable::getSymbolUses(symbolDefWorklist.pop_back_val())) {
        for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
          StringRef symbolName =
              symbolUse.getSymbolRef().cast<FlatSymbolRefAttr>().getValue();
          // Skip symbols already cloned into the kernel module.
          if (symbolTable.lookup(symbolName))
            continue;

          Operation *symbolDefClone =
              parentSymbolTable.lookup(symbolName)->clone();
          symbolDefWorklist.push_back(symbolDefClone);
          symbolTable.insert(symbolDefClone);
        }
      }
    }

    return kernelModule;
  }

  Option<std::string> dataLayoutStr{
      *this, "data-layout-str",
      llvm::cl::desc("String containing the data layout specification to be "
                     "attached to the GPU kernel module")};

  // Parsed form of `dataLayoutStr`; null when no spec was supplied.
  DataLayoutSpecInterface dataLayoutSpec;
};

} // namespace

std::unique_ptr<OperationPass<ModuleOp>>
mlir::createGpuKernelOutliningPass(StringRef dataLayoutStr) {
  return std::make_unique<GpuKernelOutliningPass>(dataLayoutStr);
}