1 //===- Utils.cpp - Utilities to support the Linalg dialect ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements utilities for the Linalg dialect. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "mlir/Dialect/Linalg/Utils/Utils.h" 14 15 #include "mlir/Analysis/SliceAnalysis.h" 16 #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" 17 #include "mlir/Dialect/Affine/IR/AffineOps.h" 18 #include "mlir/Dialect/Affine/IR/AffineValueMap.h" 19 #include "mlir/Dialect/Affine/LoopUtils.h" 20 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" 21 #include "mlir/Dialect/Arithmetic/Utils/Utils.h" 22 #include "mlir/Dialect/Linalg/IR/Linalg.h" 23 #include "mlir/Dialect/MemRef/IR/MemRef.h" 24 #include "mlir/Dialect/SCF/SCF.h" 25 #include "mlir/Dialect/Tensor/IR/Tensor.h" 26 #include "mlir/Dialect/Tensor/Utils/Utils.h" 27 #include "mlir/Dialect/Utils/StaticValueUtils.h" 28 #include "mlir/IR/AffineExpr.h" 29 #include "mlir/IR/AffineExprVisitor.h" 30 #include "mlir/IR/AffineMap.h" 31 #include "mlir/IR/Matchers.h" 32 #include "mlir/IR/OpImplementation.h" 33 #include "mlir/Pass/Pass.h" 34 #include "llvm/ADT/TypeSwitch.h" 35 #include "llvm/Support/Debug.h" 36 37 #define DEBUG_TYPE "linalg-utils" 38 39 using namespace mlir; 40 using namespace mlir::linalg; 41 using namespace mlir::scf; 42 43 static bool isZero(Value v) { 44 if (auto cst = v.getDefiningOp<arith::ConstantIndexOp>()) 45 return cst.value() == 0; 46 return false; 47 } 48 49 namespace { 50 51 // Helper visitor to determine whether an AffineExpr is tiled. 52 // This is achieved by traversing every AffineDimExpr with position `pos` and 53 // checking whether the corresponding `tileSizes[pos]` is non-zero. 54 // This also enforces only positive coefficients occur in multiplications. 55 // 56 // Example: 57 // `d0 + 2 * d1 + d3` is tiled by [0, 0, 0, 2] but not by [0, 0, 2, 0] 58 // 59 struct TileCheck : public AffineExprVisitor<TileCheck> { 60 TileCheck(ValueRange tileSizes) : tileSizes(tileSizes) {} 61 62 void visitDimExpr(AffineDimExpr expr) { 63 isTiled |= !isZero(tileSizes[expr.getPosition()]); 64 } 65 void visitAffineBinaryOpExpr(AffineBinaryOpExpr expr) { 66 visit(expr.getLHS()); 67 visit(expr.getRHS()); 68 if (expr.getKind() == mlir::AffineExprKind::Mul) 69 assert(expr.getRHS().cast<AffineConstantExpr>().getValue() > 0 && 70 "nonpositive multiplying coefficient"); 71 } 72 bool isTiled = false; 73 ValueRange tileSizes; 74 }; 75 76 } // namespace 77 78 static bool isTiled(AffineExpr expr, ValueRange tileSizes) { 79 if (!expr) 80 return false; 81 TileCheck t(tileSizes); 82 t.visit(expr); 83 return t.isTiled; 84 } 85 86 // Checks whether the `map varies with respect to a non-zero `tileSize`. 87 static bool isTiled(AffineMap map, ValueRange tileSizes) { 88 if (!map) 89 return false; 90 for (unsigned r = 0; r < map.getNumResults(); ++r) 91 if (isTiled(map.getResult(r), tileSizes)) 92 return true; 93 return false; 94 } 95 96 Optional<RegionMatcher::BinaryOpKind> 97 RegionMatcher::matchAsScalarBinaryOp(GenericOp op) { 98 auto ®ion = op.region(); 99 if (!llvm::hasSingleElement(region)) 100 return llvm::None; 101 102 Block &block = region.front(); 103 if (block.getNumArguments() != 2 || 104 !block.getArgument(0).getType().isSignlessIntOrFloat() || 105 !block.getArgument(1).getType().isSignlessIntOrFloat()) 106 return llvm::None; 107 108 auto &ops = block.getOperations(); 109 if (!llvm::hasSingleElement(block.without_terminator())) 110 return llvm::None; 111 112 using mlir::matchers::m_Val; 113 auto a = m_Val(block.getArgument(0)); 114 auto b = m_Val(block.getArgument(1)); 115 116 auto addPattern = m_Op<linalg::YieldOp>(m_Op<arith::AddIOp>(a, b)); 117 if (addPattern.match(&ops.back())) 118 return BinaryOpKind::IAdd; 119 120 return llvm::None; 121 } 122 123 /// Explicit instantiation of loop nest generator for different loop types. 124 template struct mlir::linalg::GenerateLoopNest<scf::ForOp>; 125 template struct mlir::linalg::GenerateLoopNest<scf::ParallelOp>; 126 template struct mlir::linalg::GenerateLoopNest<AffineForOp>; 127 128 /// Given a list of subview ranges, extract individual values for lower, upper 129 /// bounds and steps and put them into the corresponding vectors. 130 static void unpackRanges(ArrayRef<Range> ranges, SmallVectorImpl<Value> &lbs, 131 SmallVectorImpl<Value> &ubs, 132 SmallVectorImpl<Value> &steps) { 133 for (Range range : ranges) { 134 lbs.emplace_back(range.offset); 135 ubs.emplace_back(range.size); 136 steps.emplace_back(range.stride); 137 } 138 } 139 140 namespace mlir { 141 namespace linalg { 142 143 bool isPermutation(ArrayRef<int64_t> permutation) { 144 // Count the number of appearances for all indices. 145 SmallVector<int64_t> indexCounts(permutation.size(), 0); 146 for (auto index : permutation) { 147 // Exit if the index is out-of-range. 148 if (index < 0 || index >= static_cast<int64_t>(permutation.size())) 149 return false; 150 indexCounts[index]++; 151 } 152 // Return true if all indices appear once. 153 return count(indexCounts, 1) == static_cast<int64_t>(permutation.size()); 154 } 155 156 /// Helper function that creates a memref::DimOp or tensor::DimOp depending on 157 /// the type of `source`. 158 Value createOrFoldDimOp(OpBuilder &b, Location loc, Value source, int64_t dim) { 159 if (source.getType().isa<UnrankedMemRefType, MemRefType>()) 160 return b.createOrFold<memref::DimOp>(loc, source, dim); 161 if (source.getType().isa<UnrankedTensorType, RankedTensorType>()) 162 return b.createOrFold<tensor::DimOp>(loc, source, dim); 163 llvm_unreachable("Expected MemRefType or TensorType"); 164 } 165 166 /// Given an operation, retrieves the value of each dynamic dimension through 167 /// constructing the necessary DimOp operators. 168 SmallVector<Value, 4> getDynOperands(Location loc, Value val, OpBuilder &b) { 169 SmallVector<Value, 4> dynOperands; 170 auto shapedType = val.getType().cast<ShapedType>(); 171 for (const auto &dim : llvm::enumerate(shapedType.getShape())) { 172 if (dim.value() == ShapedType::kDynamicSize) 173 dynOperands.push_back(createOrFoldDimOp(b, loc, val, dim.index())); 174 } 175 return dynOperands; 176 } 177 178 void getUpperBoundForIndex(Value value, AffineMap &boundMap, 179 SmallVectorImpl<Value> &boundOperands) { 180 // Initialize `boundMap` and `boundOperands` to the identity returning 181 // `value`. This combination is the default result of the method if no 182 // simplification is possible. 183 assert(value.getType().isIndex() && "expect value to have index type"); 184 boundMap = AffineMap::getMultiDimIdentityMap(1, value.getContext()); 185 boundOperands.assign({value}); 186 canonicalizeMapAndOperands(&boundMap, &boundOperands); 187 188 // Continue only if there is an affine index computation to simplify. 189 Operation *definingOp = value.getDefiningOp(); 190 if (!definingOp || !isa<AffineApplyOp, AffineMinOp>(definingOp)) 191 return; 192 193 // Get the backward slice containing the affine index computation. 194 SetVector<Operation *> backwardSlice; 195 getBackwardSlice(definingOp, &backwardSlice, [](Operation *op) { 196 return isa<AffineApplyOp, AffineMinOp>(op); 197 }); 198 backwardSlice.insert(definingOp); 199 200 // Setup a system of affine constraints that describe the index computation. 201 FlatAffineValueConstraints constraints; 202 203 // Helper to find or create an identifier for the given value. 204 auto findOrCreateId = [&](Value value) { 205 if (!constraints.containsId(value)) { 206 constraints.appendDimId(value); 207 return true; 208 } 209 unsigned pos; 210 constraints.findId(value, &pos); 211 return pos < constraints.getNumDimIds(); 212 }; 213 // Helper to get the position for the given value. 214 auto getPosition = [&](Value value) { 215 unsigned pos; 216 bool exists = constraints.findId(value, &pos); 217 (void)exists; 218 assert(exists && "expect to find the identifier"); 219 return pos; 220 }; 221 222 // Add the affine operations in `backwardSlice` to the constraints. 223 for (Operation *op : llvm::reverse(backwardSlice)) { 224 // Add an identifier for all op results and operands. 225 if (!(llvm::all_of(op->getResults(), findOrCreateId) && 226 llvm::all_of(op->getOperands(), findOrCreateId))) 227 return; 228 // Add AffineApplyOps to the constraints. 229 if (auto applyOp = dyn_cast<AffineApplyOp>(op)) { 230 AffineValueMap valueMap(applyOp.getAffineMap(), applyOp.getOperands(), 231 applyOp.getResult()); 232 if (failed(constraints.composeMap(&valueMap))) 233 return; 234 continue; 235 } 236 // Add AffineMinOps to the constraints. 237 auto minOp = cast<AffineMinOp>(op); 238 AffineMap map = constraints.computeAlignedMap(minOp.getAffineMap(), 239 minOp.getOperands()); 240 if (failed(constraints.addBound(FlatAffineConstraints::UB, 241 getPosition(minOp.getResult()), map))) 242 return; 243 } 244 245 // Obtain an upper bound for the affine index computation by projecting out 246 // all temporary results and expressing the upper bound for `value` in terms 247 // of the terminals of the index computation. 248 SmallVector<AffineMap> lowerBounds(1), upperBounds(1); 249 constraints.getSliceBounds(getPosition(value), 1, value.getContext(), 250 &lowerBounds, &upperBounds); 251 252 // Verify `upperBounds[0]` is valid and has at least one result. 253 if (!upperBounds[0] || upperBounds[0].getNumResults() == 0) 254 return; 255 256 // Set `boundMap` and `boundOperands` to the computed upper bound. 257 boundMap = upperBounds[0]; 258 constraints.getAllValues(&boundOperands); 259 erase_value(boundOperands, value); 260 canonicalizeMapAndOperands(&boundMap, &boundOperands); 261 } 262 263 FailureOr<int64_t> getConstantUpperBoundForIndex(Value value) { 264 // Compute an upper bound for `value`. 265 AffineMap boundMap; 266 SmallVector<Value> boundOperands; 267 getUpperBoundForIndex(value, boundMap, boundOperands); 268 269 // Search the results of `boundMap` for constant upper bounds. 270 SmallVector<int64_t> constantBounds; 271 for (AffineExpr result : boundMap.getResults()) 272 if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) 273 constantBounds.push_back(constExpr.getValue()); 274 275 // Return the minimal upper bound or failure if none is found. 276 if (constantBounds.empty()) 277 return failure(); 278 return *std::min_element(constantBounds.begin(), constantBounds.end()); 279 } 280 281 tensor::ExtractSliceOp makeComposedExtractSliceOp( 282 OpBuilder &b, Location loc, Value source, ArrayRef<OpFoldResult> offsets, 283 ArrayRef<OpFoldResult> sizes, ArrayRef<OpFoldResult> strides) { 284 assert(source && "expect source to be nonzero"); 285 286 // Do not fold if the producer is not an ExtractSliceOp. 287 auto producerOp = source.getDefiningOp<tensor::ExtractSliceOp>(); 288 if (!producerOp) 289 return b.create<tensor::ExtractSliceOp>(loc, source, offsets, sizes, 290 strides); 291 292 // Do not fold if the producer is rank reducing or if there are any non-unit 293 // strides. Supporting non-unit strides complicates the offset computation 294 // since the consumer offsets need to be multiplied by the producer strides. 295 // TODO: support non-unit strides once there are use cases. 296 SmallVector<OpFoldResult> allStrides = producerOp.getMixedStrides(); 297 allStrides.append(strides.begin(), strides.end()); 298 bool hasNonUnitStride = any_of(allStrides, [](OpFoldResult ofr) { 299 return getConstantIntValue(ofr) != static_cast<int64_t>(1); 300 }); 301 if (hasNonUnitStride || 302 producerOp.getSourceType().getRank() != 303 producerOp.getResult().getType().cast<ShapedType>().getRank()) 304 return b.create<tensor::ExtractSliceOp>(loc, source, offsets, sizes, 305 strides); 306 307 // Fold the producer by adding the offests and extracting the slice directly 308 // from the producer source tensor. 309 SmallVector<OpFoldResult> foldedOffsets(offsets.begin(), offsets.end()); 310 AffineExpr dim1, dim2; 311 bindDims(b.getContext(), dim1, dim2); 312 for (const auto &en : enumerate(producerOp.getMixedOffsets())) { 313 SmallVector<Value> offsetValues = { 314 getValueOrCreateConstantIndexOp(b, loc, foldedOffsets[en.index()]), 315 getValueOrCreateConstantIndexOp(b, loc, en.value())}; 316 foldedOffsets[en.index()] = 317 makeComposedAffineApply(b, loc, dim1 + dim2, offsetValues).getResult(); 318 } 319 return b.create<tensor::ExtractSliceOp>(loc, producerOp.source(), 320 foldedOffsets, sizes, strides); 321 } 322 323 Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type, 324 Value source, Value pad, bool nofold) { 325 assert(type.hasStaticShape() && "expect tensor type to have static shape"); 326 327 // Exit if `source` is not defined by an ExtractSliceOp. 328 auto sliceOp = source.getDefiningOp<tensor::ExtractSliceOp>(); 329 if (!sliceOp) 330 return tensor::createPadHighOp(type, source, pad, nofold, loc, b); 331 332 // Search the `source` use-def chain for padded LinalgOps. 333 Value current = sliceOp.source(); 334 while (current) { 335 auto linalgOp = current.getDefiningOp<LinalgOp>(); 336 if (!linalgOp) 337 break; 338 OpResult opResult = current.cast<OpResult>(); 339 current = linalgOp.getOutputOperand(opResult.getResultNumber())->get(); 340 } 341 auto padOp = current ? current.getDefiningOp<tensor::PadOp>() : nullptr; 342 343 // Exit if the search fails to match a tensor::PadOp at the end of the matched 344 // LinalgOp sequence. 345 if (!padOp) 346 return tensor::createPadHighOp(type, source, pad, nofold, loc, b); 347 348 // Exit if the padded result type does not match. 349 if (sliceOp.source().getType() != type) 350 return tensor::createPadHighOp(type, source, pad, nofold, loc, b); 351 352 // Exit if the LinalgOps are not high padded. 353 if (llvm::any_of(padOp.getMixedLowPad(), [](OpFoldResult ofr) { 354 return getConstantIntValue(ofr) != static_cast<int64_t>(0); 355 })) 356 return tensor::createPadHighOp(type, source, pad, nofold, loc, b); 357 358 // Exit if `padOpSliceOp`, which defines the slice used by 359 // `padOp`, is rank-reducing. 360 auto padOpSliceOp = padOp.source().getDefiningOp<tensor::ExtractSliceOp>(); 361 if (!padOpSliceOp || 362 sliceOp.getMixedSizes().size() != padOpSliceOp.getMixedSizes().size()) 363 return tensor::createPadHighOp(type, source, pad, nofold, loc, b); 364 365 // Exit if the sizes of the dynamic sizes of `sliceOp` do not match the size 366 // of the slice padded by `padOp`. 367 if (llvm::any_of( 368 llvm::zip(sliceOp.getMixedSizes(), padOpSliceOp.getMixedSizes()), 369 [](std::tuple<OpFoldResult, OpFoldResult> it) { 370 return !isEqualConstantIntOrValue(std::get<0>(it), std::get<1>(it)); 371 })) 372 return tensor::createPadHighOp(type, source, pad, nofold, loc, b); 373 374 // Exit if the padding values do not match. 375 Attribute padOpPadAttr, padAttr; 376 Value padOpPad = padOp.getConstantPaddingValue(); 377 if (!padOpPad || !matchPattern(padOpPad, m_Constant(&padOpPadAttr)) || 378 !matchPattern(pad, m_Constant(&padAttr)) || padOpPadAttr != padAttr) 379 return tensor::createPadHighOp(type, source, pad, nofold, loc, b); 380 381 // Return the padded result if the padding values and sizes match. 382 return sliceOp.source(); 383 } 384 385 GenericOp makeTransposeOp(OpBuilder &b, Location loc, Value inputTensor, 386 Value outputTensor, 387 ArrayRef<int64_t> transposeVector) { 388 auto resultTensorType = outputTensor.getType().cast<RankedTensorType>(); 389 Type elementType = resultTensorType.getElementType(); 390 391 assert(isPermutation(transposeVector) && 392 "expect transpose vector to be a permutation"); 393 assert(transposeVector.size() == 394 static_cast<size_t>(resultTensorType.getRank()) && 395 "expect transpose vector size to match result tensor rank"); 396 397 // Compute the transpose and the indentity indexing maps. 398 SmallVector<AffineMap> indexingMaps = { 399 inversePermutation(AffineMap::getPermutationMap( 400 SmallVector<unsigned>(transposeVector.begin(), transposeVector.end()), 401 b.getContext())), 402 AffineMap::getMultiDimIdentityMap(transposeVector.size(), 403 b.getContext())}; 404 SmallVector<llvm::StringRef> iteratorTypes(transposeVector.size(), 405 getParallelIteratorTypeName()); 406 407 // Create a GenericOp to transpose `inputTensor` into `outputTensor`. 408 auto transposeOp = b.create<GenericOp>( 409 loc, resultTensorType, inputTensor, outputTensor, 410 b.getAffineMapArrayAttr(indexingMaps), b.getStrArrayAttr(iteratorTypes), 411 /*doc=*/nullptr, 412 /*library_call=*/nullptr); 413 Region &body = transposeOp.getRegion(); 414 body.push_back(new Block()); 415 body.front().addArguments({elementType, elementType}, {loc, loc}); 416 417 // Create the body of the transpose operation. 418 OpBuilder::InsertionGuard g(b); 419 b.setInsertionPointToEnd(&body.front()); 420 b.create<YieldOp>(loc, transposeOp.getRegion().front().getArgument(0)); 421 return transposeOp; 422 } 423 424 GenericOp makeMemRefCopyOp(OpBuilder &b, Location loc, Value from, Value to) { 425 auto memrefTypeTo = to.getType().cast<MemRefType>(); 426 #ifndef NDEBUG 427 auto memrefTypeFrom = from.getType().cast<MemRefType>(); 428 assert(memrefTypeFrom.getRank() == memrefTypeTo.getRank() && 429 "`from` and `to` memref must have the same rank"); 430 #endif // NDEBUG 431 432 AffineMap id = 433 AffineMap::getMultiDimIdentityMap(memrefTypeTo.getRank(), b.getContext()); 434 SmallVector<StringRef> iteratorTypes(memrefTypeTo.getRank(), 435 getParallelIteratorTypeName()); 436 return b.create<linalg::GenericOp>( 437 loc, 438 /*inputs=*/from, 439 /*outputs=*/to, 440 /*indexingMaps=*/llvm::makeArrayRef({id, id}), 441 /*iteratorTypes=*/iteratorTypes, 442 [](OpBuilder &b, Location loc, ValueRange args) { 443 b.create<linalg::YieldOp>(loc, args.front()); 444 }); 445 } 446 447 /// Specialization to build an scf "for" nest. 448 template <> 449 void GenerateLoopNest<scf::ForOp>::doit( 450 OpBuilder &b, Location loc, ArrayRef<Range> loopRanges, LinalgOp linalgOp, 451 ArrayRef<Attribute> iteratorTypes, 452 function_ref<scf::ValueVector(OpBuilder &, Location, ValueRange, 453 ValueRange)> 454 bodyBuilderFn, 455 Optional<LinalgLoopDistributionOptions> distributionOptions, 456 ArrayRef<StringRef> distributionTypes) { 457 SmallVector<Value> iterArgInitValues = linalgOp.getOutputTensorOperands(); 458 // Create procInfo so it dominates loops, if appropriate. 459 SmallVector<ProcInfo, 4> procInfo; 460 SmallVector<DistributionMethod, 0> distributionMethod; 461 if (distributionOptions.hasValue()) { 462 // Collect loop ranges for parallel dimensions. 463 SmallVector<Range, 2> parallelLoopRanges; 464 for (const auto &iteratorType : enumerate(iteratorTypes)) 465 if (isParallelIterator(iteratorType.value())) 466 parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); 467 468 // Get their distribution schemes. 469 distributionMethod = distributionOptions->distributionMethod; 470 if (distributionMethod.size() < parallelLoopRanges.size()) 471 parallelLoopRanges.resize(distributionMethod.size()); 472 procInfo = distributionOptions->procInfo(b, loc, parallelLoopRanges); 473 } 474 475 SmallVector<Value, 4> lbs, ubs, steps; 476 unpackRanges(loopRanges, lbs, ubs, steps); 477 LoopNest loopNest = mlir::scf::buildLoopNest( 478 b, loc, lbs, ubs, steps, iterArgInitValues, 479 [&](OpBuilder &b, Location loc, ValueRange ivs, ValueRange iterArgs) { 480 assert(iterArgs.size() == linalgOp.getOutputTensorOperands().size() && 481 "expect the number of output tensors and iter args to match"); 482 SmallVector<Value> operandValuesToUse = 483 linalgOp.getInputAndOutputOperands(); 484 if (!iterArgs.empty()) { 485 operandValuesToUse = linalgOp.getInputOperands(); 486 operandValuesToUse.append(iterArgs.begin(), iterArgs.end()); 487 } 488 return bodyBuilderFn(b, loc, ivs, operandValuesToUse); 489 }); 490 491 if (!distributionOptions || loopNest.loops.empty()) 492 return; 493 494 // Filter out scf.for loops that were created out of parallel dimensions. 495 SmallVector<scf::ForOp, 4> loops; 496 for (const auto &iteratorType : enumerate(iteratorTypes)) 497 if (isParallelIterator(iteratorType.value())) 498 loops.push_back(loopNest.loops[iteratorType.index()]); 499 500 // Distribute - only supports cyclic distribution for now. 501 for (auto it : llvm::zip(loops, procInfo, distributionMethod)) 502 if (std::get<2>(it) == DistributionMethod::Cyclic) 503 mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId, 504 std::get<1>(it).nprocs); 505 } 506 507 /// Specialization to build affine "for" nest. 508 template <> 509 void GenerateLoopNest<AffineForOp>::doit( 510 OpBuilder &b, Location loc, ArrayRef<Range> loopRanges, LinalgOp linalgOp, 511 ArrayRef<Attribute> iteratorTypes, 512 function_ref<scf::ValueVector(OpBuilder &, Location, ValueRange, 513 ValueRange)> 514 bodyBuilderFn, 515 Optional<LinalgLoopDistributionOptions>, ArrayRef<StringRef>) { 516 SmallVector<Value> iterArgInitValues = linalgOp.getOutputTensorOperands(); 517 assert(iterArgInitValues.empty() && "unexpected AffineForOp init values"); 518 SmallVector<Value, 4> lbs, ubs, steps; 519 unpackRanges(loopRanges, lbs, ubs, steps); 520 521 // Affine loops require constant steps. 522 SmallVector<int64_t, 4> constantSteps; 523 constantSteps.reserve(steps.size()); 524 for (Value v : steps) { 525 auto op = v.getDefiningOp<arith::ConstantIndexOp>(); 526 assert(op && "Affine loops require constant steps"); 527 constantSteps.push_back(op.value()); 528 } 529 530 mlir::buildAffineLoopNest(b, loc, lbs, ubs, constantSteps, 531 [&](OpBuilder &b, Location loc, ValueRange ivs) { 532 SmallVector<Value> operandValuesToUse = 533 linalgOp.getInputAndOutputOperands(); 534 bodyBuilderFn(b, loc, ivs, operandValuesToUse); 535 }); 536 } 537 538 /// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`. 539 void updateBoundsForCyclicDistribution(OpBuilder &b, Location loc, Value procId, 540 Value nprocs, Value &lb, Value &ub, 541 Value &step) { 542 AffineExpr d0, d1; 543 bindDims(b.getContext(), d0, d1); 544 AffineExpr s0 = getAffineSymbolExpr(0, b.getContext()); 545 lb = makeComposedAffineApply(b, loc, d0 + d1 * s0, {lb, procId, step}); 546 step = makeComposedAffineApply(b, loc, d0 * s0, {nprocs, step}); 547 } 548 549 /// Generates a loop nest consisting of scf.parallel and scf.for, depending 550 /// on the `iteratorTypes.` Consecutive parallel loops create a single 551 /// scf.parallel operation; each sequential loop creates a new scf.for 552 /// operation. The body of the innermost loop is populated by 553 /// `bodyBuilderFn` that accepts a range of induction variables for all 554 /// loops. `ivStorage` is used to store the partial list of induction 555 /// variables. 556 // TODO: this function can be made iterative instead. However, it 557 // will have at most as many recursive calls as nested loops, which rarely 558 // exceeds 10. 559 static void generateParallelLoopNest( 560 OpBuilder &b, Location loc, ValueRange lbs, ValueRange ubs, 561 ValueRange steps, ArrayRef<Attribute> iteratorTypes, 562 function_ref<void(OpBuilder &, Location, ValueRange)> bodyBuilderFn, 563 SmallVectorImpl<Value> &ivStorage, 564 ArrayRef<DistributionMethod> distributionMethod = {}) { 565 assert(lbs.size() == ubs.size()); 566 assert(lbs.size() == steps.size()); 567 assert(lbs.size() == iteratorTypes.size()); 568 569 // If there are no (more) loops to be generated, generate the body and be 570 // done with it. 571 if (iteratorTypes.empty()) { 572 bodyBuilderFn(b, loc, ivStorage); 573 return; 574 } 575 576 // Find the outermost parallel loops and drop their types from the list. 577 unsigned nLoops = iteratorTypes.size(); 578 unsigned nOuterPar = 579 nLoops - iteratorTypes.drop_while(isParallelIterator).size(); 580 581 // If there are no outer parallel loops, generate one sequential loop and 582 // recurse. Note that we wouldn't have dropped anything from `iteratorTypes` 583 // in this case. 584 if (nOuterPar == 0) { 585 LoopNest singleLoop = buildLoopNest( 586 b, loc, lbs.take_front(), ubs.take_front(), steps.take_front(), 587 [&](OpBuilder &b, Location loc, ValueRange ivs) { 588 ivStorage.append(ivs.begin(), ivs.end()); 589 generateParallelLoopNest(b, loc, lbs.drop_front(), ubs.drop_front(), 590 steps.drop_front(), 591 iteratorTypes.drop_front(), bodyBuilderFn, 592 ivStorage, distributionMethod); 593 }); 594 return; 595 } 596 if (distributionMethod.empty()) { 597 // Generate a single parallel loop-nest operation for all outermost 598 // parallel loops and recurse. 599 b.create<scf::ParallelOp>( 600 loc, lbs.take_front(nOuterPar), ubs.take_front(nOuterPar), 601 steps.take_front(nOuterPar), 602 [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) { 603 ivStorage.append(localIvs.begin(), localIvs.end()); 604 generateParallelLoopNest( 605 nestedBuilder, nestedLoc, lbs.drop_front(nOuterPar), 606 ubs.drop_front(nOuterPar), steps.drop_front(nOuterPar), 607 iteratorTypes.drop_front(nOuterPar), bodyBuilderFn, ivStorage, 608 (distributionMethod.size() < nOuterPar) 609 ? ArrayRef<DistributionMethod>() 610 : distributionMethod.drop_front(nOuterPar)); 611 }); 612 return; 613 } 614 615 // Process all consecutive similarly distributed loops simultaneously. 616 DistributionMethod methodToUse = distributionMethod[0]; 617 unsigned numProcessed = 1; 618 for (unsigned i = 1; i < nOuterPar && i < distributionMethod.size(); ++i) { 619 if (distributionMethod[i] != methodToUse) 620 break; 621 numProcessed++; 622 } 623 624 switch (methodToUse) { 625 case DistributionMethod::Cyclic: { 626 // Generate a single parallel loop-nest operation for all outermost 627 // parallel loops and recurse. 628 b.create<scf::ParallelOp>( 629 loc, lbs.take_front(numProcessed), ubs.take_front(numProcessed), 630 steps.take_front(numProcessed), 631 [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) { 632 ivStorage.append(localIvs.begin(), localIvs.end()); 633 generateParallelLoopNest( 634 nestedBuilder, nestedLoc, lbs.drop_front(numProcessed), 635 ubs.drop_front(numProcessed), steps.drop_front(numProcessed), 636 iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage, 637 (distributionMethod.size() < numProcessed) 638 ? ArrayRef<DistributionMethod>() 639 : distributionMethod.drop_front(numProcessed)); 640 }); 641 return; 642 } 643 case DistributionMethod::CyclicNumProcsGeNumIters: { 644 // Check (for the processed loops) that the iteration is in-bounds. 645 ArithBuilder ab(b, loc); 646 Value cond = ab.slt(lbs[0], ubs[0]); 647 for (unsigned i = 1; i < numProcessed; ++i) 648 cond = ab._and(cond, ab.slt(lbs[i], ubs[i])); 649 ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed)); 650 b.create<scf::IfOp>(loc, cond, [&](OpBuilder &b, Location loc) { 651 generateParallelLoopNest( 652 b, loc, lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), 653 steps.drop_front(numProcessed), 654 iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage, 655 distributionMethod.drop_front(numProcessed)); 656 b.create<scf::YieldOp>(loc, ValueRange{}); 657 }); 658 return; 659 } 660 case DistributionMethod::CyclicNumProcsEqNumIters: 661 // No check/loops needed here. Set the `%iv` to be the `%lb` and proceed 662 // with inner loop generation. 663 ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed)); 664 generateParallelLoopNest( 665 b, loc, lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), 666 steps.drop_front(numProcessed), iteratorTypes.drop_front(numProcessed), 667 bodyBuilderFn, ivStorage, distributionMethod.drop_front(numProcessed)); 668 return; 669 } 670 } 671 672 /// Specialization for generating a mix of parallel and sequential scf loops. 673 template <> 674 void GenerateLoopNest<scf::ParallelOp>::doit( 675 OpBuilder &b, Location loc, ArrayRef<Range> loopRanges, LinalgOp linalgOp, 676 ArrayRef<Attribute> iteratorTypes, 677 function_ref<scf::ValueVector(OpBuilder &, Location, ValueRange, 678 ValueRange)> 679 bodyBuilderFn, 680 Optional<LinalgLoopDistributionOptions> distributionOptions, 681 ArrayRef<StringRef> distributionTypes) { 682 SmallVector<Value> iterArgInitValues = linalgOp.getOutputTensorOperands(); 683 assert(iterArgInitValues.empty() && "unexpected ParallelOp init values"); 684 // This function may be passed more iterator types than ranges. 685 assert(iteratorTypes.size() >= loopRanges.size() && 686 "expected iterator type for all ranges"); 687 iteratorTypes = iteratorTypes.take_front(loopRanges.size()); 688 SmallVector<Value, 8> lbsStorage, ubsStorage, stepsStorage, ivs; 689 unsigned numLoops = iteratorTypes.size(); 690 ivs.reserve(numLoops); 691 lbsStorage.reserve(numLoops); 692 ubsStorage.reserve(numLoops); 693 stepsStorage.reserve(numLoops); 694 695 // Get the loop lb, ub, and step. 696 unpackRanges(loopRanges, lbsStorage, ubsStorage, stepsStorage); 697 698 // Modify the lb, ub, and step based on the distribution options. 699 SmallVector<DistributionMethod, 0> distributionMethod; 700 if (distributionOptions) { 701 auto &options = distributionOptions.getValue(); 702 distributionMethod.assign(distributionOptions->distributionMethod.begin(), 703 distributionOptions->distributionMethod.end()); 704 SmallVector<Range, 2> parallelLoopRanges; 705 for (const auto &iteratorType : enumerate(iteratorTypes)) { 706 if (isParallelIterator(iteratorType.value())) 707 parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); 708 } 709 if (distributionMethod.size() < parallelLoopRanges.size()) 710 parallelLoopRanges.resize(distributionMethod.size()); 711 SmallVector<ProcInfo, 2> procInfo = 712 options.procInfo(b, loc, parallelLoopRanges); 713 unsigned index = 0; 714 for (const auto &iteratorType : enumerate(iteratorTypes)) { 715 if (index >= procInfo.size()) 716 break; 717 if (isParallelIterator(iteratorType.value())) { 718 unsigned i = iteratorType.index(); 719 updateBoundsForCyclicDistribution(b, loc, procInfo[index].procId, 720 procInfo[index].nprocs, lbsStorage[i], 721 ubsStorage[i], stepsStorage[i]); 722 index++; 723 } 724 } 725 } 726 ValueRange lbs(lbsStorage), ubs(ubsStorage), steps(stepsStorage); 727 generateParallelLoopNest( 728 b, loc, lbs, ubs, steps, iteratorTypes, 729 [&](OpBuilder &b, Location loc, ValueRange ivs) { 730 SmallVector<Value> operandValuesToUse = 731 linalgOp.getInputAndOutputOperands(); 732 bodyBuilderFn(b, loc, ivs, operandValuesToUse); 733 }, 734 ivs, distributionMethod); 735 736 assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops"); 737 } 738 739 static Value fullyComposeAndAffineApply(OpBuilder &b, Location loc, 740 AffineExpr expr, ValueRange operands) { 741 AffineMap map = AffineMap::inferFromExprList({expr}).front(); 742 SmallVector<Value> normalizedOperands(operands.begin(), operands.end()); 743 mlir::fullyComposeAffineMapAndOperands(&map, &normalizedOperands); 744 canonicalizeMapAndOperands(&map, &normalizedOperands); 745 return b.createOrFold<AffineApplyOp>(loc, map, normalizedOperands); 746 } 747 748 Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile, 749 ValueRange tileSizes, AffineMap map, ValueRange lbs, 750 ValueRange ubs, ValueRange subShapeSizes) { 751 auto shapedType = valueToTile.getType().dyn_cast<ShapedType>(); 752 assert(shapedType && "only shaped types can be tiled"); 753 ArrayRef<int64_t> shape = shapedType.getShape(); 754 int64_t rank = shapedType.getRank(); 755 756 // Construct a new subview / extract_slice for the tile. 757 SmallVector<OpFoldResult, 4> offsets, sizes, strides; 758 offsets.reserve(rank); 759 sizes.reserve(rank); 760 strides.reserve(rank); 761 for (unsigned r = 0; r < rank; ++r) { 762 LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: for dim#" << r); 763 if (!isTiled(map.getSubMap({r}), tileSizes)) { 764 offsets.push_back(builder.getIndexAttr(0)); 765 Value dim = createOrFoldDimOp(builder, loc, valueToTile, r); 766 sizes.push_back(getAsOpFoldResult(dim)); 767 strides.push_back(builder.getIndexAttr(1)); 768 LLVM_DEBUG(llvm::dbgs() << ": not tiled: use size: " << dim << "\n"); 769 continue; 770 } 771 LLVM_DEBUG(llvm::dbgs() << ": tiled: figure out subsize...\n"); 772 773 // Tiling creates a new slice at the proper index, the slice step is 1 774 // (i.e. the op does not subsample, stepping occurs in the loop). 775 auto m = map.getSubMap({r}); 776 LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: submap: " << m << "\n"); 777 auto offset = applyMapToValues(builder, loc, m, lbs).front(); 778 offsets.push_back(offset); 779 auto closedIntSize = 780 applyMapToValues(builder, loc, m, subShapeSizes).front(); 781 // Resulting size needs to be made half open interval again. 782 AffineExpr s0 = getAffineSymbolExpr(0, builder.getContext()); 783 Value size = 784 fullyComposeAndAffineApply(builder, loc, s0 + 1, closedIntSize); 785 LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: raw size: " << size << "\n"); 786 787 // The size of the subview / extract_slice should be trimmed to avoid 788 // out-of-bounds accesses, unless: 789 // a. We statically know the subshape size divides the shape size evenly. 790 // b. The subshape size is 1. According to the way the loops are set up, 791 // tensors with "0" dimensions would never be constructed. 792 int64_t shapeSize = shape[r]; 793 auto sizeCst = size.getDefiningOp<arith::ConstantIndexOp>(); 794 auto hasTileSizeOne = sizeCst && sizeCst.value() == 1; 795 auto dividesEvenly = sizeCst && !ShapedType::isDynamic(shapeSize) && 796 ((shapeSize % sizeCst.value()) == 0); 797 if (!hasTileSizeOne && !dividesEvenly) { 798 LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: shapeSize=" << shapeSize 799 << ", size: " << size 800 << ": make sure in bound with affine.min\n"); 801 802 AffineExpr dim0, dim1, dim2; 803 bindDims(builder.getContext(), dim0, dim1, dim2); 804 805 // Get the dimension size for this dimension. We need to first calculate 806 // the max index and then plus one. This is important because for 807 // convolution ops, we have its input window dimension's affine map of the 808 // form `(d0 * s0 + d1)`, where `d0`/`d1 is an output/filter window 809 // dimension and `s0` is stride. Directly use the dimension size of 810 // output/filer window dimensions will cause incorrect calculation. 811 AffineMap minusOneMap = 812 AffineMap::inferFromExprList({ArrayRef<AffineExpr>{dim0 - 1}}) 813 .front(); 814 AffineMap plusOneMap = 815 AffineMap::inferFromExprList({ArrayRef<AffineExpr>{dim0 + 1}}) 816 .front(); 817 auto maxIndices = llvm::to_vector<8>(llvm::map_range(ubs, [&](Value ub) { 818 return makeComposedAffineApply(builder, loc, minusOneMap, {ub}) 819 .getResult(); 820 })); 821 Value maxIndex = applyMapToValues(builder, loc, m, maxIndices).front(); 822 Value d = makeComposedAffineApply(builder, loc, plusOneMap, {maxIndex}); 823 824 // Compute min(size, dim - offset) to avoid out-of-bounds accesses. 825 AffineMap minMap = AffineMap::inferFromExprList( 826 {ArrayRef<AffineExpr>{dim0, dim1 - dim2}}) 827 .front(); 828 SmallVector<Value, 4> operands{size, d, offset}; 829 fullyComposeAffineMapAndOperands(&minMap, &operands); 830 canonicalizeMapAndOperands(&minMap, &operands); 831 size = builder.create<AffineMinOp>(loc, builder.getIndexType(), minMap, 832 operands); 833 } 834 835 sizes.push_back(size); 836 LLVM_DEBUG(llvm::dbgs() 837 << "makeTiledShape: new offset: " << offset << "\n"); 838 LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: new size: " << size << "\n"); 839 strides.push_back(builder.getIndexAttr(1)); 840 } 841 842 auto *sliceOp = TypeSwitch<ShapedType, Operation *>(shapedType) 843 .Case([&](MemRefType) { 844 return builder.create<memref::SubViewOp>( 845 loc, valueToTile, offsets, sizes, strides); 846 }) 847 .Case([&](RankedTensorType) { 848 return makeComposedExtractSliceOp( 849 builder, loc, valueToTile, offsets, sizes, strides); 850 }) 851 .Default([](ShapedType) -> Operation * { 852 llvm_unreachable("Unexpected shaped type"); 853 }); 854 return sliceOp->getResult(0); 855 } 856 857 SmallVector<Value> computeTileOffsets(OpBuilder &b, Location loc, 858 ValueRange ivs, ValueRange tileSizes) { 859 SmallVector<Value> offsets; 860 for (unsigned idx = 0, idxIvs = 0, e = tileSizes.size(); idx < e; ++idx) { 861 LLVM_DEBUG(llvm::dbgs() << "makeTiledShapes: for loop#" << idx << "\n"); 862 bool isTiled = !isZero(tileSizes[idx]); 863 offsets.push_back( 864 isTiled ? ivs[idxIvs++] 865 : b.create<arith::ConstantIndexOp>(loc, 0).getResult()); 866 LLVM_DEBUG(llvm::dbgs() 867 << "computeTileOffsets: " << offsets.back() << "\n"); 868 } 869 return offsets; 870 } 871 872 SmallVector<Value> computeTileSizes(OpBuilder &b, Location loc, ValueRange ivs, 873 ValueRange tileSizes, 874 ArrayRef<Value> sizeBounds) { 875 SmallVector<Value> sizes; 876 for (unsigned idx = 0, e = tileSizes.size(); idx < e; ++idx) { 877 bool isTiled = !isZero(tileSizes[idx]); 878 // Before composing, we need to make range a closed interval. 879 Value size = isTiled ? tileSizes[idx] : sizeBounds[idx]; 880 AffineExpr d0 = getAffineDimExpr(0, b.getContext()); 881 sizes.push_back(fullyComposeAndAffineApply(b, loc, d0 - 1, size)); 882 LLVM_DEBUG(llvm::dbgs() << "computeTileSizes: " << sizes.back() << "\n"); 883 } 884 return sizes; 885 } 886 887 SmallVector<Value, 4> makeTiledShapes(OpBuilder &b, Location loc, 888 LinalgOp linalgOp, 889 ArrayRef<Value> valuesToTile, 890 ValueRange ivs, ValueRange tileSizes, 891 ArrayRef<Value> sizeBounds) { 892 assert(ivs.size() == static_cast<size_t>(llvm::count_if( 893 llvm::make_range(tileSizes.begin(), tileSizes.end()), 894 [](Value v) { return !isZero(v); })) && 895 "expected as many ivs as non-zero sizes"); 896 897 // Construct (potentially temporary) mins and maxes on which to apply maps 898 // that define tile subshapes. 899 SmallVector<Value> lbs = computeTileOffsets(b, loc, ivs, tileSizes); 900 SmallVector<Value> subShapeSizes = 901 computeTileSizes(b, loc, ivs, tileSizes, sizeBounds); 902 903 assert(static_cast<int64_t>(valuesToTile.size()) == 904 linalgOp.getNumInputsAndOutputs() && 905 "expected one value to tile for every operand"); 906 SmallVector<Value, 4> tiledShapes; 907 tiledShapes.reserve(valuesToTile.size()); 908 for (OpOperand *opOperand : linalgOp.getInputAndOutputOperands()) { 909 Value shapedOp = valuesToTile[opOperand->getOperandNumber()]; 910 LLVM_DEBUG(llvm::dbgs() << "makeTiledShapes: for operand " << shapedOp); 911 AffineMap map = linalgOp.getTiedIndexingMap(opOperand); 912 // Use `opOperand` as is if it is not tiled and not an output tensor. Having 913 // an extract/insert slice pair for all output tensors simplifies follow up 914 // transformations such as padding and bufferization since the 915 // extract/insert slice pairs make the accessed iteration argument 916 // subdomains explicit. 917 if (!isTiled(map, tileSizes) && !linalgOp.isOutputTensor(opOperand)) { 918 tiledShapes.push_back(shapedOp); 919 LLVM_DEBUG(llvm::dbgs() << ": not tiled: use shape: " 920 << opOperand->get().getType() << "\n"); 921 continue; 922 } 923 LLVM_DEBUG(llvm::dbgs() << ": tiled: figure out subshape...\n"); 924 925 tiledShapes.push_back(makeTiledShape(b, loc, shapedOp, tileSizes, map, lbs, 926 sizeBounds, subShapeSizes)); 927 } 928 929 return tiledShapes; 930 } 931 932 void addTileLoopIvsToIndexOpResults(OpBuilder &b, LinalgOp tiledOp, 933 ArrayRef<Value> ivs) { 934 if (tiledOp.hasIndexSemantics()) { 935 for (IndexOp indexOp : tiledOp.getBlock()->getOps<IndexOp>()) { 936 if (ivs[indexOp.dim()] == nullptr) 937 continue; 938 OpBuilder::InsertionGuard guard(b); 939 b.setInsertionPointAfter(indexOp); 940 AffineExpr index, offset; 941 bindDims(b.getContext(), index, offset); 942 AffineApplyOp applyOp = makeComposedAffineApply( 943 b, indexOp.getLoc(), index + offset, 944 ValueRange{indexOp.getResult(), ivs[indexOp.dim()]}); 945 indexOp.getResult().replaceAllUsesExcept(applyOp, applyOp); 946 } 947 } 948 } 949 950 } // namespace linalg 951 } // namespace mlir 952