1 //===- Utils.cpp - Utilities to support the Linalg dialect ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements utilities for the Linalg dialect. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "mlir/Dialect/Linalg/Utils/Utils.h" 14 15 #include "mlir/Dialect/Affine/EDSC/Intrinsics.h" 16 #include "mlir/Dialect/Affine/IR/AffineOps.h" 17 #include "mlir/Dialect/Linalg/IR/LinalgOps.h" 18 #include "mlir/Dialect/Linalg/IR/LinalgTypes.h" 19 #include "mlir/Dialect/SCF/EDSC/Builders.h" 20 #include "mlir/Dialect/SCF/SCF.h" 21 #include "mlir/Dialect/StandardOps/IR/Ops.h" 22 #include "mlir/IR/AffineExpr.h" 23 #include "mlir/IR/AffineMap.h" 24 #include "mlir/IR/Matchers.h" 25 #include "mlir/IR/OpImplementation.h" 26 #include "mlir/Pass/Pass.h" 27 #include "mlir/Transforms/LoopUtils.h" 28 29 using namespace mlir; 30 using namespace mlir::linalg; 31 using namespace mlir::scf; 32 33 Optional<RegionMatcher::BinaryOpKind> 34 RegionMatcher::matchAsScalarBinaryOp(GenericOp op) { 35 auto ®ion = op.region(); 36 if (!llvm::hasSingleElement(region)) 37 return llvm::None; 38 39 Block &block = region.front(); 40 if (block.getNumArguments() != 2 || 41 !block.getArgument(0).getType().isSignlessIntOrFloat() || 42 !block.getArgument(1).getType().isSignlessIntOrFloat()) 43 return llvm::None; 44 45 auto &ops = block.getOperations(); 46 if (!llvm::hasSingleElement(block.without_terminator())) 47 return llvm::None; 48 49 using mlir::matchers::m_Val; 50 auto a = m_Val(block.getArgument(0)); 51 auto b = m_Val(block.getArgument(1)); 52 53 auto addPattern = m_Op<linalg::YieldOp>(m_Op<AddIOp>(a, b)); 54 if (addPattern.match(&ops.back())) 55 return BinaryOpKind::IAdd; 56 57 return llvm::None; 58 } 59 60 bool mlir::linalg::isParallelIteratorType(Attribute attr) { 61 if (auto strAttr = attr.dyn_cast<StringAttr>()) { 62 return strAttr.getValue() == getParallelIteratorTypeName(); 63 } 64 return false; 65 } 66 67 bool mlir::linalg::isReductionIteratorType(Attribute attr) { 68 if (auto strAttr = attr.dyn_cast<StringAttr>()) { 69 return strAttr.getValue() == getReductionIteratorTypeName(); 70 } 71 return false; 72 } 73 74 bool mlir::linalg::isWindowIteratorType(Attribute attr) { 75 if (auto strAttr = attr.dyn_cast<StringAttr>()) { 76 return strAttr.getValue() == getWindowIteratorTypeName(); 77 } 78 return false; 79 } 80 81 /// Explicit instantiation of loop nest generator for different loop types. 82 template struct mlir::linalg::GenerateLoopNest<scf::ForOp>; 83 template struct mlir::linalg::GenerateLoopNest<scf::ParallelOp>; 84 template struct mlir::linalg::GenerateLoopNest<AffineForOp>; 85 86 /// Given a list of subview ranges, extract individual values for lower, upper 87 /// bounds and steps and put them into the corresponding vectors. 88 static void unpackRanges(ArrayRef<Range> ranges, SmallVectorImpl<Value> &lbs, 89 SmallVectorImpl<Value> &ubs, 90 SmallVectorImpl<Value> &steps) { 91 for (Range range : ranges) { 92 lbs.emplace_back(range.offset); 93 ubs.emplace_back(range.size); 94 steps.emplace_back(range.stride); 95 } 96 } 97 98 namespace mlir { 99 namespace linalg { 100 101 SmallVector<int64_t, 8> getStaticShape(LinalgOp linalgOp) { 102 SmallVector<int64_t, 8> res; 103 for (Value v : linalgOp.getShapedOperands()) { 104 auto shape = v.getType().cast<ShapedType>().getShape(); 105 res.append(shape.begin(), shape.end()); 106 } 107 return res; 108 } 109 110 Optional<SmallVector<int64_t, 4>> getStaticLoopRanges(LinalgOp linalgOp) { 111 SmallVector<int64_t, 8> viewSizes = getStaticShape(linalgOp); 112 AffineMap invertedMap = linalgOp.getShapesToLoopsMap(); 113 if (!invertedMap) 114 return {}; 115 return invertedMap.compose(viewSizes); 116 } 117 118 /// If `size` comes from an AffineMinOp and one of the values of AffineMinOp 119 /// is a constant then return a new value set to the smallest such constant. 120 /// Otherwise returngetSmallestBoundingIndex nullptr. 121 IntegerAttr getSmallestBoundingIndex(Value size) { 122 Optional<int64_t> boundingConst = {}; 123 if (auto affineMinOp = size.getDefiningOp<AffineMinOp>()) { 124 for (auto e : affineMinOp.getAffineMap().getResults()) 125 if (auto cst = e.dyn_cast<AffineConstantExpr>()) 126 boundingConst = boundingConst 127 ? std::min(boundingConst.getValue(), cst.getValue()) 128 : cst.getValue(); 129 } else if (auto constIndexOp = size.getDefiningOp<ConstantOp>()) { 130 if (constIndexOp.getType().isa<IndexType>()) 131 boundingConst = constIndexOp.value().cast<IntegerAttr>().getInt(); 132 } else if (auto affineApplyOp = size.getDefiningOp<AffineApplyOp>()) { 133 if (auto cExpr = affineApplyOp.getAffineMap() 134 .getResult(0) 135 .dyn_cast<AffineConstantExpr>()) 136 boundingConst = cExpr.getValue(); 137 } 138 if (boundingConst && *boundingConst >= 0) 139 return Builder(size.getContext()).getIndexAttr(*boundingConst); 140 return nullptr; 141 } 142 143 /// Specialization to build an scf "for" nest. 144 template <> 145 void GenerateLoopNest<scf::ForOp>::doit( 146 ArrayRef<Range> loopRanges, ValueRange iterArgInitValues, 147 ArrayRef<Attribute> iteratorTypes, 148 function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn, 149 Optional<LinalgLoopDistributionOptions> distributionOptions) { 150 // Create procInfo so it dominates loops, if appropriate. 151 OpBuilder &builder = edsc::ScopedContext::getBuilderRef(); 152 Location loc = edsc::ScopedContext::getLocation(); 153 SmallVector<ProcInfo, 2> procInfo; 154 if (distributionOptions.hasValue()) 155 procInfo = distributionOptions->procInfo(builder, loc, loopRanges); 156 157 SmallVector<Value, 4> lbs, ubs, steps; 158 unpackRanges(loopRanges, lbs, ubs, steps); 159 LoopNest loopNest = 160 edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn); 161 162 if (!distributionOptions.hasValue() || loopNest.loops.empty()) 163 return; 164 165 // Only supports cyclic distribution for now. 166 for (auto it : llvm::zip(loopNest.loops, procInfo, 167 distributionOptions->distributionMethod)) 168 if (std::get<2>(it) == DistributionMethod::Cyclic) 169 mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId, 170 std::get<1>(it).nprocs); 171 } 172 173 /// Specialization to build affine "for" nest. 174 template <> 175 void GenerateLoopNest<AffineForOp>::doit( 176 ArrayRef<Range> loopRanges, ValueRange iterArgInitValues, 177 ArrayRef<Attribute> iteratorTypes, 178 function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn, 179 Optional<LinalgLoopDistributionOptions>) { 180 assert(iterArgInitValues.empty() && "unexpected AffineForOp init values"); 181 SmallVector<Value, 4> lbs, ubs, steps; 182 unpackRanges(loopRanges, lbs, ubs, steps); 183 184 // Affine loops require constant steps. 185 SmallVector<int64_t, 4> constantSteps; 186 constantSteps.reserve(steps.size()); 187 for (Value v : steps) { 188 auto op = v.getDefiningOp<ConstantIndexOp>(); 189 assert(op && "Affine loops require constant steps"); 190 constantSteps.push_back(op.getValue()); 191 } 192 193 auto bodyBuilderWithoutIterArgsFn = [&](ValueRange ivs) { 194 bodyBuilderFn(ivs, {}); 195 }; 196 edsc::affineLoopNestBuilder(lbs, ubs, constantSteps, 197 bodyBuilderWithoutIterArgsFn); 198 } 199 200 /// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`. 201 static void updateBoundsForCyclicDistribution(OpBuilder &builder, Location loc, 202 Value procId, Value nprocs, 203 Value &lb, Value &ub, 204 Value &step) { 205 using edsc::op::operator+; 206 using edsc::op::operator*; 207 lb = lb + (procId * step); 208 step = nprocs * step; 209 } 210 211 /// Generates a loop nest consisting of scf.parallel and scf.for, depending 212 /// on the `iteratorTypes.` Consecutive parallel loops create a single 213 /// scf.parallel operation; each sequential loop creates a new scf.for 214 /// operation. The body of the innermost loop is populated by 215 /// `bodyBuilderFn` that accepts a range of induction variables for all 216 /// loops. `ivStorage` is used to store the partial list of induction 217 /// variables. 218 // TODO: this function can be made iterative instead. However, it 219 // will have at most as many recursive calls as nested loops, which rarely 220 // exceeds 10. 221 static void 222 generateParallelLoopNest(ValueRange lbs, ValueRange ubs, ValueRange steps, 223 ArrayRef<Attribute> iteratorTypes, 224 function_ref<void(ValueRange)> bodyBuilderFn, 225 SmallVectorImpl<Value> &ivStorage, 226 ArrayRef<DistributionMethod> distributionMethod = {}) { 227 assert(lbs.size() == ubs.size()); 228 assert(lbs.size() == steps.size()); 229 assert(lbs.size() == iteratorTypes.size()); 230 231 // If there are no (more) loops to be generated, generate the body and be 232 // done with it. 233 if (iteratorTypes.empty()) 234 return bodyBuilderFn(ivStorage); 235 236 // Find the outermost parallel loops and drop their types from the list. 237 unsigned nLoops = iteratorTypes.size(); 238 unsigned nOuterPar = 239 nLoops - iteratorTypes.drop_while(isParallelIteratorType).size(); 240 241 // If there are no outer parallel loops, generate one sequential loop and 242 // recurse. Note that we wouldn't have dropped anything from `iteratorTypes` 243 // in this case. 244 if (nOuterPar == 0) { 245 edsc::loopNestBuilder(lbs[0], ubs[0], steps[0], [&](Value iv) { 246 ivStorage.push_back(iv); 247 generateParallelLoopNest(lbs.drop_front(), ubs.drop_front(), 248 steps.drop_front(), iteratorTypes.drop_front(), 249 bodyBuilderFn, ivStorage, distributionMethod); 250 }); 251 return; 252 } 253 if (distributionMethod.empty()) { 254 // Generate a single parallel loop-nest operation for all outermost 255 // parallel loops and recurse. 256 edsc::OperationBuilder<scf::ParallelOp>( 257 lbs.take_front(nOuterPar), ubs.take_front(nOuterPar), 258 steps.take_front(nOuterPar), 259 [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) { 260 edsc::ScopedContext context(nestedBuilder, nestedLoc); 261 ivStorage.append(localIvs.begin(), localIvs.end()); 262 generateParallelLoopNest( 263 lbs.drop_front(nOuterPar), ubs.drop_front(nOuterPar), 264 steps.drop_front(nOuterPar), iteratorTypes.drop_front(nOuterPar), 265 bodyBuilderFn, ivStorage, 266 (distributionMethod.size() < nOuterPar) 267 ? ArrayRef<DistributionMethod>() 268 : distributionMethod.drop_front(nOuterPar)); 269 }); 270 return; 271 } 272 273 // Process all consecutive similarly distributed loops simultaneously. 274 DistributionMethod methodToUse = distributionMethod[0]; 275 unsigned numProcessed = 1; 276 for (unsigned i = 1; i < nOuterPar && i < distributionMethod.size(); ++i) { 277 if (distributionMethod[i] != methodToUse) 278 break; 279 numProcessed++; 280 } 281 282 switch (methodToUse) { 283 case DistributionMethod::Cyclic: { 284 // Generate a single parallel loop-nest operation for all outermost 285 // parallel loops and recurse. 286 edsc::OperationBuilder<scf::ParallelOp>( 287 lbs.take_front(numProcessed), ubs.take_front(numProcessed), 288 steps.take_front(numProcessed), 289 [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) { 290 edsc::ScopedContext context(nestedBuilder, nestedLoc); 291 ivStorage.append(localIvs.begin(), localIvs.end()); 292 generateParallelLoopNest( 293 lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), 294 steps.drop_front(numProcessed), 295 iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage, 296 (distributionMethod.size() < numProcessed) 297 ? ArrayRef<DistributionMethod>() 298 : distributionMethod.drop_front(numProcessed)); 299 }); 300 return; 301 } 302 case DistributionMethod::CyclicNumProcsGeNumIters: { 303 // Check (for the processed loops) that the iteration is in-bounds. 304 using edsc::op::slt; 305 using edsc::op::operator&&; 306 Value cond = slt(lbs[0], ubs[0]); 307 for (unsigned i = 1; i < numProcessed; ++i) 308 cond = cond && slt(lbs[i], ubs[i]); 309 ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed)); 310 edsc::conditionBuilder(cond, [&]() { 311 generateParallelLoopNest( 312 lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), 313 steps.drop_front(numProcessed), 314 iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage, 315 distributionMethod.drop_front(numProcessed)); 316 }); 317 return; 318 } 319 case DistributionMethod::CyclicNumProcsEqNumIters: 320 // No check/loops needed here. Set the `%iv` to be the `%lb` and proceed 321 // with inner loop generation. 322 ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed)); 323 generateParallelLoopNest( 324 lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), 325 steps.drop_front(numProcessed), iteratorTypes.drop_front(numProcessed), 326 bodyBuilderFn, ivStorage, distributionMethod.drop_front(numProcessed)); 327 return; 328 } 329 } 330 331 /// Specialization for generating a mix of parallel and sequential scf loops. 332 template <> 333 void GenerateLoopNest<scf::ParallelOp>::doit( 334 ArrayRef<Range> loopRanges, ValueRange iterArgInitValues, 335 ArrayRef<Attribute> iteratorTypes, 336 function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn, 337 Optional<LinalgLoopDistributionOptions> distributionOptions) { 338 assert(iterArgInitValues.empty() && "unexpected ParallelOp init values"); 339 // This function may be passed more iterator types than ranges. 340 assert(iteratorTypes.size() >= loopRanges.size() && 341 "expected iterator type for all ranges"); 342 iteratorTypes = iteratorTypes.take_front(loopRanges.size()); 343 SmallVector<Value, 8> lbsStorage, ubsStorage, stepsStorage, ivs; 344 unsigned numLoops = iteratorTypes.size(); 345 ivs.reserve(numLoops); 346 lbsStorage.reserve(numLoops); 347 ubsStorage.reserve(numLoops); 348 stepsStorage.reserve(numLoops); 349 350 // Get the loop lb, ub, and step. 351 unpackRanges(loopRanges, lbsStorage, ubsStorage, stepsStorage); 352 353 // Modify the lb, ub, and step based on the distribution options. 354 SmallVector<DistributionMethod, 0> distributionMethod; 355 if (distributionOptions) { 356 auto &options = distributionOptions.getValue(); 357 OpBuilder &builder = edsc::ScopedContext::getBuilderRef(); 358 Location loc = edsc::ScopedContext::getLocation(); 359 distributionMethod.assign(distributionOptions->distributionMethod.begin(), 360 distributionOptions->distributionMethod.end()); 361 SmallVector<Range, 2> parallelLoopRanges; 362 for (auto iteratorType : enumerate(iteratorTypes)) { 363 if (isParallelIteratorType(iteratorType.value())) 364 parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); 365 } 366 if (distributionMethod.size() < parallelLoopRanges.size()) 367 parallelLoopRanges.resize(distributionMethod.size()); 368 SmallVector<ProcInfo, 2> procInfo = 369 options.procInfo(builder, loc, parallelLoopRanges); 370 unsigned index = 0; 371 for (auto iteratorType : enumerate(iteratorTypes)) { 372 if (index >= procInfo.size()) 373 break; 374 if (isParallelIteratorType(iteratorType.value())) { 375 unsigned i = iteratorType.index(); 376 updateBoundsForCyclicDistribution(builder, loc, procInfo[index].procId, 377 procInfo[index].nprocs, lbsStorage[i], 378 ubsStorage[i], stepsStorage[i]); 379 index++; 380 } 381 } 382 } 383 ValueRange lbs(lbsStorage), ubs(ubsStorage), steps(stepsStorage); 384 auto bodyBuilderWithoutIterArgsFn = [&](ValueRange ivs) { 385 bodyBuilderFn(ivs, {}); 386 }; 387 generateParallelLoopNest(lbs, ubs, steps, iteratorTypes, 388 bodyBuilderWithoutIterArgsFn, ivs, 389 distributionMethod); 390 391 assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops"); 392 } 393 394 } // namespace linalg 395 } // namespace mlir 396