//===- VectorToSCF.cpp - Conversion from Vector to mix of SCF and Std -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements target-dependent lowering of vector transfer operations.
//
//===----------------------------------------------------------------------===//

#include <type_traits>

#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"

#include "../PassDetail.h"
#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/SCF/EDSC/Builders.h"
#include "mlir/Dialect/SCF/EDSC/Intrinsics.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using vector::TransferReadOp;
using vector::TransferWriteOp;

// Return a list of Values that correspond to multiple AffineApplyOp, one for
// each result of `map`. Each `expr` in `map` is canonicalized and folded
// greedily according to its operands.
// TODO: factor out in a common location that both linalg and vector can use.
43 static SmallVector<Value, 4> 44 applyMapToValues(OpBuilder &b, Location loc, AffineMap map, ValueRange values) { 45 SmallVector<Value, 4> res; 46 res.reserve(map.getNumResults()); 47 unsigned numDims = map.getNumDims(), numSym = map.getNumSymbols(); 48 // For each `expr` in `map`, applies the `expr` to the values extracted from 49 // ranges. If the resulting application can be folded into a Value, the 50 // folding occurs eagerly. Otherwise, an affine.apply operation is emitted. 51 for (auto expr : map.getResults()) { 52 AffineMap map = AffineMap::get(numDims, numSym, expr); 53 SmallVector<Value, 4> operands(values.begin(), values.end()); 54 fullyComposeAffineMapAndOperands(&map, &operands); 55 canonicalizeMapAndOperands(&map, &operands); 56 res.push_back(b.createOrFold<AffineApplyOp>(loc, map, operands)); 57 } 58 return res; 59 } 60 61 namespace { 62 /// Helper class captures the common information needed to lower N>1-D vector 63 /// transfer operations (read and write). 64 /// On construction, this class opens an edsc::ScopedContext for simpler IR 65 /// manipulation. 66 /// In pseudo-IR, for an n-D vector_transfer_read such as: 67 /// 68 /// ``` 69 /// vector_transfer_read(%m, %offsets, identity_map, %fill) : 70 /// memref<(leading_dims) x (major_dims) x (minor_dims) x type>, 71 /// vector<(major_dims) x (minor_dims) x type> 72 /// ``` 73 /// 74 /// where rank(minor_dims) is the lower-level vector rank (e.g. 1 for LLVM or 75 /// higher). 
76 /// 77 /// This is the entry point to emitting pseudo-IR resembling: 78 /// 79 /// ``` 80 /// %tmp = alloc(): memref<(major_dims) x vector<minor_dim x type>> 81 /// for (%ivs_major, {0}, {vector_shape}, {1}) { // (N-1)-D loop nest 82 /// if (any_of(%ivs_major + %offsets, <, major_dims)) { 83 /// %v = vector_transfer_read( 84 /// {%offsets_leading, %ivs_major + %offsets_major, %offsets_minor}, 85 /// %ivs_minor): 86 /// memref<(leading_dims) x (major_dims) x (minor_dims) x type>, 87 /// vector<(minor_dims) x type>; 88 /// store(%v, %tmp); 89 /// } else { 90 /// %v = splat(vector<(minor_dims) x type>, %fill) 91 /// store(%v, %tmp, %ivs_major); 92 /// } 93 /// } 94 /// %res = load(%tmp, %0): memref<(major_dims) x vector<minor_dim x type>>): 95 // vector<(major_dims) x (minor_dims) x type> 96 /// ``` 97 /// 98 template <typename ConcreteOp> 99 class NDTransferOpHelper { 100 public: 101 NDTransferOpHelper(PatternRewriter &rewriter, ConcreteOp xferOp, 102 const VectorTransferToSCFOptions &options) 103 : rewriter(rewriter), options(options), loc(xferOp.getLoc()), 104 scope(std::make_unique<ScopedContext>(rewriter, loc)), xferOp(xferOp), 105 op(xferOp.getOperation()) { 106 vectorType = xferOp.getVectorType(); 107 // TODO: when we go to k > 1-D vectors adapt minorRank. 108 minorRank = 1; 109 majorRank = vectorType.getRank() - minorRank; 110 leadingRank = xferOp.getLeadingShapedRank(); 111 majorVectorType = 112 VectorType::get(vectorType.getShape().take_front(majorRank), 113 vectorType.getElementType()); 114 minorVectorType = 115 VectorType::get(vectorType.getShape().take_back(minorRank), 116 vectorType.getElementType()); 117 /// Memref of minor vector type is used for individual transfers. 
118 memRefMinorVectorType = 119 MemRefType::get(majorVectorType.getShape(), minorVectorType, {}, 120 xferOp.getShapedType() 121 .template cast<MemRefType>() 122 .getMemorySpaceAsInt()); 123 } 124 125 LogicalResult doReplace(); 126 127 private: 128 /// Creates the loop nest on the "major" dimensions and calls the 129 /// `loopBodyBuilder` lambda in the context of the loop nest. 130 void 131 emitLoops(llvm::function_ref<void(ValueRange, ValueRange, ValueRange, 132 ValueRange, const MemRefBoundsCapture &)> 133 loopBodyBuilder); 134 135 /// Common state to lower vector transfer ops. 136 PatternRewriter &rewriter; 137 const VectorTransferToSCFOptions &options; 138 Location loc; 139 std::unique_ptr<ScopedContext> scope; 140 ConcreteOp xferOp; 141 Operation *op; 142 // A vector transfer copies data between: 143 // - memref<(leading_dims) x (major_dims) x (minor_dims) x type> 144 // - vector<(major_dims) x (minor_dims) x type> 145 unsigned minorRank; // for now always 1 146 unsigned majorRank; // vector rank - minorRank 147 unsigned leadingRank; // memref rank - vector rank 148 VectorType vectorType; // vector<(major_dims) x (minor_dims) x type> 149 VectorType majorVectorType; // vector<(major_dims) x type> 150 VectorType minorVectorType; // vector<(minor_dims) x type> 151 MemRefType memRefMinorVectorType; // memref<vector<(minor_dims) x type>> 152 }; 153 154 template <typename ConcreteOp> 155 void NDTransferOpHelper<ConcreteOp>::emitLoops( 156 llvm::function_ref<void(ValueRange, ValueRange, ValueRange, ValueRange, 157 const MemRefBoundsCapture &)> 158 loopBodyBuilder) { 159 /// Loop nest operates on the major dimensions 160 MemRefBoundsCapture memrefBoundsCapture(xferOp.source()); 161 162 if (options.unroll) { 163 auto shape = majorVectorType.getShape(); 164 auto strides = computeStrides(shape); 165 unsigned numUnrolledInstances = computeMaxLinearIndex(shape); 166 ValueRange indices(xferOp.indices()); 167 for (unsigned idx = 0; idx < numUnrolledInstances; ++idx) { 168 
SmallVector<int64_t, 4> offsets = delinearize(strides, idx); 169 SmallVector<Value, 4> offsetValues = 170 llvm::to_vector<4>(llvm::map_range(offsets, [](int64_t off) -> Value { 171 return std_constant_index(off); 172 })); 173 loopBodyBuilder(offsetValues, indices.take_front(leadingRank), 174 indices.drop_front(leadingRank).take_front(majorRank), 175 indices.take_back(minorRank), memrefBoundsCapture); 176 } 177 } else { 178 VectorBoundsCapture vectorBoundsCapture(majorVectorType); 179 auto majorLbs = vectorBoundsCapture.getLbs(); 180 auto majorUbs = vectorBoundsCapture.getUbs(); 181 auto majorSteps = vectorBoundsCapture.getSteps(); 182 affineLoopNestBuilder( 183 majorLbs, majorUbs, majorSteps, [&](ValueRange majorIvs) { 184 ValueRange indices(xferOp.indices()); 185 loopBodyBuilder(majorIvs, indices.take_front(leadingRank), 186 indices.drop_front(leadingRank).take_front(majorRank), 187 indices.take_back(minorRank), memrefBoundsCapture); 188 }); 189 } 190 } 191 192 static Optional<int64_t> extractConstantIndex(Value v) { 193 if (auto cstOp = v.getDefiningOp<ConstantIndexOp>()) 194 return cstOp.getValue(); 195 if (auto affineApplyOp = v.getDefiningOp<AffineApplyOp>()) 196 if (affineApplyOp.getAffineMap().isSingleConstant()) 197 return affineApplyOp.getAffineMap().getSingleConstantResult(); 198 return None; 199 } 200 201 // Missing foldings of scf.if make it necessary to perform poor man's folding 202 // eagerly, especially in the case of unrolling. In the future, this should go 203 // away once scf.if folds properly. 204 static Value onTheFlyFoldSLT(Value v, Value ub) { 205 using namespace mlir::edsc::op; 206 auto maybeCstV = extractConstantIndex(v); 207 auto maybeCstUb = extractConstantIndex(ub); 208 if (maybeCstV && maybeCstUb && *maybeCstV < *maybeCstUb) 209 return Value(); 210 return slt(v, ub); 211 } 212 213 /// 1. Compute the indexings `majorIvs + majorOffsets` and save them in 214 /// `majorIvsPlusOffsets`. 215 /// 2. 
Return a value of i1 that determines whether the first 216 /// `majorIvs.rank()` 217 /// dimensions `majorIvs + majorOffsets` are all within `memrefBounds`. 218 static Value 219 emitInBoundsCondition(PatternRewriter &rewriter, 220 VectorTransferOpInterface xferOp, unsigned leadingRank, 221 ValueRange majorIvs, ValueRange majorOffsets, 222 const MemRefBoundsCapture &memrefBounds, 223 SmallVectorImpl<Value> &majorIvsPlusOffsets) { 224 Value inBoundsCondition; 225 majorIvsPlusOffsets.reserve(majorIvs.size()); 226 unsigned idx = 0; 227 SmallVector<Value, 4> bounds = 228 applyMapToValues(rewriter, xferOp.getLoc(), xferOp.permutation_map(), 229 memrefBounds.getUbs()); 230 for (auto it : llvm::zip(majorIvs, majorOffsets, bounds)) { 231 Value iv = std::get<0>(it), off = std::get<1>(it), ub = std::get<2>(it); 232 using namespace mlir::edsc::op; 233 majorIvsPlusOffsets.push_back(iv + off); 234 if (xferOp.isMaskedDim(leadingRank + idx)) { 235 Value inBoundsCond = onTheFlyFoldSLT(majorIvsPlusOffsets.back(), ub); 236 if (inBoundsCond) 237 inBoundsCondition = (inBoundsCondition) 238 ? (inBoundsCondition && inBoundsCond) 239 : inBoundsCond; 240 } 241 ++idx; 242 } 243 return inBoundsCondition; 244 } 245 246 // TODO: Parallelism and threadlocal considerations. 
247 static Value setAllocAtFunctionEntry(MemRefType memRefMinorVectorType, 248 Operation *op) { 249 auto &b = ScopedContext::getBuilderRef(); 250 OpBuilder::InsertionGuard guard(b); 251 Operation *scope = 252 op->getParentWithTrait<OpTrait::AutomaticAllocationScope>(); 253 assert(scope && "Expected op to be inside automatic allocation scope"); 254 b.setInsertionPointToStart(&scope->getRegion(0).front()); 255 Value res = std_alloca(memRefMinorVectorType); 256 return res; 257 } 258 259 template <> 260 LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() { 261 Value alloc, result; 262 if (options.unroll) 263 result = std_splat(vectorType, xferOp.padding()); 264 else 265 alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op); 266 267 emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets, 268 ValueRange majorOffsets, ValueRange minorOffsets, 269 const MemRefBoundsCapture &memrefBounds) { 270 /// Lambda to load 1-D vector in the current loop ivs + offset context. 271 auto load1DVector = [&](ValueRange majorIvsPlusOffsets) -> Value { 272 SmallVector<Value, 8> indexing; 273 indexing.reserve(leadingRank + majorRank + minorRank); 274 indexing.append(leadingOffsets.begin(), leadingOffsets.end()); 275 indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end()); 276 indexing.append(minorOffsets.begin(), minorOffsets.end()); 277 Value memref = xferOp.source(); 278 auto map = 279 getTransferMinorIdentityMap(xferOp.getShapedType(), minorVectorType); 280 ArrayAttr masked; 281 if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) { 282 OpBuilder &b = ScopedContext::getBuilderRef(); 283 masked = b.getBoolArrayAttr({false}); 284 } 285 return vector_transfer_read(minorVectorType, memref, indexing, 286 AffineMapAttr::get(map), xferOp.padding(), 287 masked); 288 }; 289 290 // 1. Compute the inBoundsCondition in the current loops ivs + offset 291 // context. 
292 SmallVector<Value, 4> majorIvsPlusOffsets; 293 Value inBoundsCondition = emitInBoundsCondition( 294 rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()), 295 leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets); 296 297 if (inBoundsCondition) { 298 // 2. If the condition is not null, we need an IfOp, which may yield 299 // if `options.unroll` is true. 300 SmallVector<Type, 1> resultType; 301 if (options.unroll) 302 resultType.push_back(vectorType); 303 304 // 3. If in-bounds, progressively lower to a 1-D transfer read, otherwise 305 // splat a 1-D vector. 306 ValueRange ifResults = conditionBuilder( 307 resultType, inBoundsCondition, 308 [&]() -> scf::ValueVector { 309 Value vector = load1DVector(majorIvsPlusOffsets); 310 // 3.a. If `options.unroll` is true, insert the 1-D vector in the 311 // aggregate. We must yield and merge with the `else` branch. 312 if (options.unroll) { 313 vector = vector_insert(vector, result, majorIvs); 314 return {vector}; 315 } 316 // 3.b. Otherwise, just go through the temporary `alloc`. 317 std_store(vector, alloc, majorIvs); 318 return {}; 319 }, 320 [&]() -> scf::ValueVector { 321 Value vector = std_splat(minorVectorType, xferOp.padding()); 322 // 3.c. If `options.unroll` is true, insert the 1-D vector in the 323 // aggregate. We must yield and merge with the `then` branch. 324 if (options.unroll) { 325 vector = vector_insert(vector, result, majorIvs); 326 return {vector}; 327 } 328 // 3.d. Otherwise, just go through the temporary `alloc`. 329 std_store(vector, alloc, majorIvs); 330 return {}; 331 }); 332 333 if (!resultType.empty()) 334 result = *ifResults.begin(); 335 } else { 336 // 4. Guaranteed in-bounds, progressively lower to a 1-D transfer read. 337 Value loaded1D = load1DVector(majorIvsPlusOffsets); 338 // 5.a. If `options.unroll` is true, insert the 1-D vector in the 339 // aggregate. 340 if (options.unroll) 341 result = vector_insert(loaded1D, result, majorIvs); 342 // 5.b. 
Otherwise, just go through the temporary `alloc`. 343 else 344 std_store(loaded1D, alloc, majorIvs); 345 } 346 }); 347 348 assert((!options.unroll ^ (bool)result) && 349 "Expected resulting Value iff unroll"); 350 if (!result) 351 result = std_load(vector_type_cast(MemRefType::get({}, vectorType), alloc)); 352 rewriter.replaceOp(op, result); 353 354 return success(); 355 } 356 357 template <> 358 LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() { 359 Value alloc; 360 if (!options.unroll) { 361 alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op); 362 std_store(xferOp.vector(), 363 vector_type_cast(MemRefType::get({}, vectorType), alloc)); 364 } 365 366 emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets, 367 ValueRange majorOffsets, ValueRange minorOffsets, 368 const MemRefBoundsCapture &memrefBounds) { 369 // Lower to 1-D vector_transfer_write and let recursion handle it. 370 auto emitTransferWrite = [&](ValueRange majorIvsPlusOffsets) { 371 SmallVector<Value, 8> indexing; 372 indexing.reserve(leadingRank + majorRank + minorRank); 373 indexing.append(leadingOffsets.begin(), leadingOffsets.end()); 374 indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end()); 375 indexing.append(minorOffsets.begin(), minorOffsets.end()); 376 Value result; 377 // If `options.unroll` is true, extract the 1-D vector from the 378 // aggregate. 379 if (options.unroll) 380 result = vector_extract(xferOp.vector(), majorIvs); 381 else 382 result = std_load(alloc, majorIvs); 383 auto map = 384 getTransferMinorIdentityMap(xferOp.getShapedType(), minorVectorType); 385 ArrayAttr masked; 386 if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) { 387 OpBuilder &b = ScopedContext::getBuilderRef(); 388 masked = b.getBoolArrayAttr({false}); 389 } 390 vector_transfer_write(result, xferOp.source(), indexing, 391 AffineMapAttr::get(map), masked); 392 }; 393 394 // 1. Compute the inBoundsCondition in the current loops ivs + offset 395 // context. 
396 SmallVector<Value, 4> majorIvsPlusOffsets; 397 Value inBoundsCondition = emitInBoundsCondition( 398 rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()), 399 leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets); 400 401 if (inBoundsCondition) { 402 // 2.a. If the condition is not null, we need an IfOp, to write 403 // conditionally. Progressively lower to a 1-D transfer write. 404 conditionBuilder(inBoundsCondition, 405 [&] { emitTransferWrite(majorIvsPlusOffsets); }); 406 } else { 407 // 2.b. Guaranteed in-bounds. Progressively lower to a 1-D transfer write. 408 emitTransferWrite(majorIvsPlusOffsets); 409 } 410 }); 411 412 rewriter.eraseOp(op); 413 414 return success(); 415 } 416 417 } // namespace 418 419 /// Analyzes the `transfer` to find an access dimension along the fastest remote 420 /// MemRef dimension. If such a dimension with coalescing properties is found, 421 /// `pivs` and `vectorBoundsCapture` are swapped so that the invocation of 422 /// LoopNestBuilder captures it in the innermost loop. 423 template <typename TransferOpTy> 424 static int computeCoalescedIndex(TransferOpTy transfer) { 425 // rank of the remote memory access, coalescing behavior occurs on the 426 // innermost memory dimension. 427 auto remoteRank = transfer.getShapedType().getRank(); 428 // Iterate over the results expressions of the permutation map to determine 429 // the loop order for creating pointwise copies between remote and local 430 // memories. 431 int coalescedIdx = -1; 432 auto exprs = transfer.permutation_map().getResults(); 433 for (auto en : llvm::enumerate(exprs)) { 434 auto dim = en.value().template dyn_cast<AffineDimExpr>(); 435 if (!dim) { 436 continue; 437 } 438 auto memRefDim = dim.getPosition(); 439 if (memRefDim == remoteRank - 1) { 440 // memRefDim has coalescing properties, it should be swapped in the last 441 // position. 
442 assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices"); 443 coalescedIdx = en.index(); 444 } 445 } 446 return coalescedIdx; 447 } 448 449 template <typename TransferOpTy> 450 VectorTransferRewriter<TransferOpTy>::VectorTransferRewriter( 451 VectorTransferToSCFOptions options, MLIRContext *context) 452 : RewritePattern(TransferOpTy::getOperationName(), 1, context), 453 options(options) {} 454 455 /// Used for staging the transfer in a local buffer. 456 template <typename TransferOpTy> 457 MemRefType VectorTransferRewriter<TransferOpTy>::tmpMemRefType( 458 TransferOpTy transfer) const { 459 auto vectorType = transfer.getVectorType(); 460 return MemRefType::get(vectorType.getShape().drop_back(), 461 VectorType::get(vectorType.getShape().take_back(), 462 vectorType.getElementType()), 463 {}, 0); 464 } 465 466 static void emitWithBoundsChecks( 467 PatternRewriter &rewriter, VectorTransferOpInterface transfer, 468 ValueRange ivs, const MemRefBoundsCapture &memRefBoundsCapture, 469 function_ref<void(ArrayRef<Value>)> inBoundsFun, 470 function_ref<void(ArrayRef<Value>)> outOfBoundsFun = nullptr) { 471 // Permute the incoming indices according to the permutation map. 472 SmallVector<Value, 4> indices = 473 applyMapToValues(rewriter, transfer.getLoc(), transfer.permutation_map(), 474 transfer.indices()); 475 476 // Generate a bounds check if necessary. 477 SmallVector<Value, 4> majorIvsPlusOffsets; 478 Value inBoundsCondition = 479 emitInBoundsCondition(rewriter, transfer, 0, ivs, indices, 480 memRefBoundsCapture, majorIvsPlusOffsets); 481 482 // Apply the permutation map to the ivs. The permutation map may not use all 483 // the inputs. 484 SmallVector<Value, 4> scalarAccessExprs(transfer.indices().size()); 485 for (unsigned memRefDim = 0; memRefDim < transfer.indices().size(); 486 ++memRefDim) { 487 // Linear search on a small number of entries. 
488 int loopIndex = -1; 489 auto exprs = transfer.permutation_map().getResults(); 490 for (auto en : llvm::enumerate(exprs)) { 491 auto expr = en.value(); 492 auto dim = expr.dyn_cast<AffineDimExpr>(); 493 // Sanity check. 494 assert((dim || expr.cast<AffineConstantExpr>().getValue() == 0) && 495 "Expected dim or 0 in permutationMap"); 496 if (dim && memRefDim == dim.getPosition()) { 497 loopIndex = en.index(); 498 break; 499 } 500 } 501 502 using namespace edsc::op; 503 auto i = transfer.indices()[memRefDim]; 504 scalarAccessExprs[memRefDim] = loopIndex < 0 ? i : i + ivs[loopIndex]; 505 } 506 507 if (inBoundsCondition) 508 conditionBuilder( 509 /* scf.if */ inBoundsCondition, // { 510 [&] { inBoundsFun(scalarAccessExprs); }, 511 // } else { 512 outOfBoundsFun ? [&] { outOfBoundsFun(scalarAccessExprs); } 513 : function_ref<void()>() 514 // } 515 ); 516 else 517 inBoundsFun(scalarAccessExprs); 518 } 519 520 namespace mlir { 521 522 /// Lowers TransferReadOp into a combination of: 523 /// 1. local memory allocation; 524 /// 2. perfect loop nest over: 525 /// a. scalar load from local buffers (viewed as a scalar memref); 526 /// a. scalar store to original memref (with padding). 527 /// 3. vector_load from local buffer (viewed as a memref<1 x vector>); 528 /// 4. local memory deallocation. 529 /// 530 /// Lowers the data transfer part of a TransferReadOp while ensuring no 531 /// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by 532 /// padding. 533 534 /// Performs the rewrite. 535 template <> 536 LogicalResult VectorTransferRewriter<TransferReadOp>::matchAndRewrite( 537 Operation *op, PatternRewriter &rewriter) const { 538 using namespace mlir::edsc::op; 539 540 TransferReadOp transfer = cast<TransferReadOp>(op); 541 auto memRefType = transfer.getShapedType().dyn_cast<MemRefType>(); 542 if (!memRefType) 543 return failure(); 544 // Fall back to a loop if the fastest varying stride is not 1 or it is 545 // permuted. 
546 int64_t offset; 547 SmallVector<int64_t, 4> strides; 548 auto successStrides = getStridesAndOffset(memRefType, strides, offset); 549 if (succeeded(successStrides) && strides.back() == 1 && 550 transfer.permutation_map().isMinorIdentity()) { 551 // If > 1D, emit a bunch of loops around 1-D vector transfers. 552 if (transfer.getVectorType().getRank() > 1) 553 return NDTransferOpHelper<TransferReadOp>(rewriter, transfer, options) 554 .doReplace(); 555 // If 1-D this is now handled by the target-specific lowering. 556 if (transfer.getVectorType().getRank() == 1) 557 return failure(); 558 } 559 560 // Conservative lowering to scalar load / stores. 561 // 1. Setup all the captures. 562 ScopedContext scope(rewriter, transfer.getLoc()); 563 StdIndexedValue remote(transfer.source()); 564 MemRefBoundsCapture memRefBoundsCapture(transfer.source()); 565 VectorBoundsCapture vectorBoundsCapture(transfer.vector()); 566 int coalescedIdx = computeCoalescedIndex(transfer); 567 // Swap the vectorBoundsCapture which will reorder loop bounds. 568 if (coalescedIdx >= 0) 569 vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1, 570 coalescedIdx); 571 572 auto lbs = vectorBoundsCapture.getLbs(); 573 auto ubs = vectorBoundsCapture.getUbs(); 574 SmallVector<Value, 8> steps; 575 steps.reserve(vectorBoundsCapture.getSteps().size()); 576 for (auto step : vectorBoundsCapture.getSteps()) 577 steps.push_back(std_constant_index(step)); 578 579 // 2. Emit alloc-copy-load-dealloc. 580 MLIRContext *ctx = op->getContext(); 581 Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer); 582 StdIndexedValue local(tmp); 583 loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) { 584 auto ivsStorage = llvm::to_vector<8>(loopIvs); 585 // Swap the ivs which will reorder memory accesses. 
586 if (coalescedIdx >= 0) 587 std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]); 588 589 ArrayRef<Value> ivs(ivsStorage); 590 Value pos = std_index_cast(IntegerType::get(ctx, 32), ivs.back()); 591 Value inVector = local(ivs.drop_back()); 592 auto loadValue = [&](ArrayRef<Value> indices) { 593 Value vector = vector_insert_element(remote(indices), inVector, pos); 594 local(ivs.drop_back()) = vector; 595 }; 596 auto loadPadding = [&](ArrayRef<Value>) { 597 Value vector = vector_insert_element(transfer.padding(), inVector, pos); 598 local(ivs.drop_back()) = vector; 599 }; 600 emitWithBoundsChecks( 601 rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()), ivs, 602 memRefBoundsCapture, loadValue, loadPadding); 603 }); 604 Value vectorValue = std_load(vector_type_cast(tmp)); 605 606 // 3. Propagate. 607 rewriter.replaceOp(op, vectorValue); 608 return success(); 609 } 610 611 /// Lowers TransferWriteOp into a combination of: 612 /// 1. local memory allocation; 613 /// 2. vector_store to local buffer (viewed as a memref<1 x vector>); 614 /// 3. perfect loop nest over: 615 /// a. scalar load from local buffers (viewed as a scalar memref); 616 /// a. scalar store to original memref (if in bounds). 617 /// 4. local memory deallocation. 618 /// 619 /// More specifically, lowers the data transfer part while ensuring no 620 /// out-of-bounds accesses are possible. 621 template <> 622 LogicalResult VectorTransferRewriter<TransferWriteOp>::matchAndRewrite( 623 Operation *op, PatternRewriter &rewriter) const { 624 using namespace edsc::op; 625 626 TransferWriteOp transfer = cast<TransferWriteOp>(op); 627 auto memRefType = transfer.getShapedType().template dyn_cast<MemRefType>(); 628 if (!memRefType) 629 return failure(); 630 631 // Fall back to a loop if the fastest varying stride is not 1 or it is 632 // permuted. 
633 int64_t offset; 634 SmallVector<int64_t, 4> strides; 635 auto successStrides = getStridesAndOffset(memRefType, strides, offset); 636 if (succeeded(successStrides) && strides.back() == 1 && 637 transfer.permutation_map().isMinorIdentity()) { 638 // If > 1D, emit a bunch of loops around 1-D vector transfers. 639 if (transfer.getVectorType().getRank() > 1) 640 return NDTransferOpHelper<TransferWriteOp>(rewriter, transfer, options) 641 .doReplace(); 642 // If 1-D this is now handled by the target-specific lowering. 643 if (transfer.getVectorType().getRank() == 1) 644 return failure(); 645 } 646 647 // 1. Setup all the captures. 648 ScopedContext scope(rewriter, transfer.getLoc()); 649 StdIndexedValue remote(transfer.source()); 650 MemRefBoundsCapture memRefBoundsCapture(transfer.source()); 651 Value vectorValue(transfer.vector()); 652 VectorBoundsCapture vectorBoundsCapture(transfer.vector()); 653 int coalescedIdx = computeCoalescedIndex(transfer); 654 // Swap the vectorBoundsCapture which will reorder loop bounds. 655 if (coalescedIdx >= 0) 656 vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1, 657 coalescedIdx); 658 659 auto lbs = vectorBoundsCapture.getLbs(); 660 auto ubs = vectorBoundsCapture.getUbs(); 661 SmallVector<Value, 8> steps; 662 steps.reserve(vectorBoundsCapture.getSteps().size()); 663 for (auto step : vectorBoundsCapture.getSteps()) 664 steps.push_back(std_constant_index(step)); 665 666 // 2. Emit alloc-store-copy-dealloc. 667 Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer); 668 StdIndexedValue local(tmp); 669 Value vec = vector_type_cast(tmp); 670 std_store(vectorValue, vec); 671 loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) { 672 auto ivsStorage = llvm::to_vector<8>(loopIvs); 673 // Swap the ivsStorage which will reorder memory accesses. 
674 if (coalescedIdx >= 0) 675 std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]); 676 677 ArrayRef<Value> ivs(ivsStorage); 678 Value pos = 679 std_index_cast(IntegerType::get(op->getContext(), 32), ivs.back()); 680 auto storeValue = [&](ArrayRef<Value> indices) { 681 Value scalar = vector_extract_element(local(ivs.drop_back()), pos); 682 remote(indices) = scalar; 683 }; 684 emitWithBoundsChecks( 685 rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()), ivs, 686 memRefBoundsCapture, storeValue); 687 }); 688 689 // 3. Erase. 690 rewriter.eraseOp(op); 691 return success(); 692 } 693 694 void populateVectorToSCFConversionPatterns( 695 OwningRewritePatternList &patterns, MLIRContext *context, 696 const VectorTransferToSCFOptions &options) { 697 patterns.insert<VectorTransferRewriter<vector::TransferReadOp>, 698 VectorTransferRewriter<vector::TransferWriteOp>>(options, 699 context); 700 } 701 702 } // namespace mlir 703 704 namespace { 705 706 struct ConvertVectorToSCFPass 707 : public ConvertVectorToSCFBase<ConvertVectorToSCFPass> { 708 ConvertVectorToSCFPass() = default; 709 ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) { 710 this->fullUnroll = options.unroll; 711 } 712 713 void runOnFunction() override { 714 OwningRewritePatternList patterns; 715 auto *context = getFunction().getContext(); 716 populateVectorToSCFConversionPatterns( 717 patterns, context, VectorTransferToSCFOptions().setUnroll(fullUnroll)); 718 (void)applyPatternsAndFoldGreedily(getFunction(), std::move(patterns)); 719 } 720 }; 721 722 } // namespace 723 724 std::unique_ptr<Pass> 725 mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) { 726 return std::make_unique<ConvertVectorToSCFPass>(options); 727 } 728