1 //===- VectorToSCF.cpp - Conversion from Vector to mix of SCF and Std -----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements target-dependent lowering of vector transfer operations. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <type_traits> 14 15 #include "mlir/Conversion/VectorToSCF/VectorToSCF.h" 16 17 #include "../PassDetail.h" 18 #include "mlir/Dialect/Affine/EDSC/Intrinsics.h" 19 #include "mlir/Dialect/MemRef/EDSC/Intrinsics.h" 20 #include "mlir/Dialect/SCF/EDSC/Builders.h" 21 #include "mlir/Dialect/SCF/EDSC/Intrinsics.h" 22 #include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h" 23 #include "mlir/Dialect/Vector/EDSC/Intrinsics.h" 24 #include "mlir/Dialect/Vector/VectorOps.h" 25 #include "mlir/Dialect/Vector/VectorUtils.h" 26 #include "mlir/IR/AffineExpr.h" 27 #include "mlir/IR/AffineMap.h" 28 #include "mlir/IR/Builders.h" 29 #include "mlir/IR/Matchers.h" 30 #include "mlir/Pass/Pass.h" 31 #include "mlir/Transforms/GreedyPatternRewriteDriver.h" 32 #include "mlir/Transforms/Passes.h" 33 34 using namespace mlir; 35 using namespace mlir::edsc; 36 using namespace mlir::edsc::intrinsics; 37 using vector::TransferReadOp; 38 using vector::TransferWriteOp; 39 40 // Return a list of Values that correspond to multiple AffineApplyOp, one for 41 // each result of `map`. Each `expr` in `map` is canonicalized and folded 42 // greedily according to its operands. 43 // TODO: factor out in a common location that both linalg and vector can use. 44 static SmallVector<Value, 4> 45 applyMapToValues(OpBuilder &b, Location loc, AffineMap map, ValueRange values) { 46 SmallVector<Value, 4> res; 47 res.reserve(map.getNumResults()); 48 unsigned numDims = map.getNumDims(), numSym = map.getNumSymbols(); 49 // For each `expr` in `map`, applies the `expr` to the values extracted from 50 // ranges. If the resulting application can be folded into a Value, the 51 // folding occurs eagerly. Otherwise, an affine.apply operation is emitted. 52 for (auto expr : map.getResults()) { 53 AffineMap map = AffineMap::get(numDims, numSym, expr); 54 SmallVector<Value, 4> operands(values.begin(), values.end()); 55 fullyComposeAffineMapAndOperands(&map, &operands); 56 canonicalizeMapAndOperands(&map, &operands); 57 res.push_back(b.createOrFold<AffineApplyOp>(loc, map, operands)); 58 } 59 return res; 60 } 61 62 namespace { 63 /// Helper class captures the common information needed to lower N>1-D vector 64 /// transfer operations (read and write). 65 /// On construction, this class opens an edsc::ScopedContext for simpler IR 66 /// manipulation. 67 /// In pseudo-IR, for an n-D vector_transfer_read such as: 68 /// 69 /// ``` 70 /// vector_transfer_read(%m, %offsets, identity_map, %fill) : 71 /// memref<(leading_dims) x (major_dims) x (minor_dims) x type>, 72 /// vector<(major_dims) x (minor_dims) x type> 73 /// ``` 74 /// 75 /// where rank(minor_dims) is the lower-level vector rank (e.g. 1 for LLVM or 76 /// higher). 77 /// 78 /// This is the entry point to emitting pseudo-IR resembling: 79 /// 80 /// ``` 81 /// %tmp = alloc(): memref<(major_dims) x vector<minor_dim x type>> 82 /// for (%ivs_major, {0}, {vector_shape}, {1}) { // (N-1)-D loop nest 83 /// if (any_of(%ivs_major + %offsets, <, major_dims)) { 84 /// %v = vector_transfer_read( 85 /// {%offsets_leading, %ivs_major + %offsets_major, %offsets_minor}, 86 /// %ivs_minor): 87 /// memref<(leading_dims) x (major_dims) x (minor_dims) x type>, 88 /// vector<(minor_dims) x type>; 89 /// store(%v, %tmp); 90 /// } else { 91 /// %v = splat(vector<(minor_dims) x type>, %fill) 92 /// store(%v, %tmp, %ivs_major); 93 /// } 94 /// } 95 /// %res = load(%tmp, %0): memref<(major_dims) x vector<minor_dim x type>>): 96 // vector<(major_dims) x (minor_dims) x type> 97 /// ``` 98 /// 99 template <typename ConcreteOp> 100 class NDTransferOpHelper { 101 public: 102 NDTransferOpHelper(PatternRewriter &rewriter, ConcreteOp xferOp, 103 const VectorTransferToSCFOptions &options) 104 : rewriter(rewriter), options(options), loc(xferOp.getLoc()), 105 scope(std::make_unique<ScopedContext>(rewriter, loc)), xferOp(xferOp), 106 op(xferOp.getOperation()) { 107 vectorType = xferOp.getVectorType(); 108 // TODO: when we go to k > 1-D vectors adapt minorRank. 109 minorRank = 1; 110 majorRank = vectorType.getRank() - minorRank; 111 leadingRank = xferOp.getLeadingShapedRank(); 112 majorVectorType = 113 VectorType::get(vectorType.getShape().take_front(majorRank), 114 vectorType.getElementType()); 115 minorVectorType = 116 VectorType::get(vectorType.getShape().take_back(minorRank), 117 vectorType.getElementType()); 118 /// Memref of minor vector type is used for individual transfers. 119 memRefMinorVectorType = MemRefType::get( 120 majorVectorType.getShape(), minorVectorType, {}, 121 xferOp.getShapedType().template cast<MemRefType>().getMemorySpace()); 122 } 123 124 LogicalResult doReplace(); 125 126 private: 127 /// Creates the loop nest on the "major" dimensions and calls the 128 /// `loopBodyBuilder` lambda in the context of the loop nest. 129 void 130 emitLoops(llvm::function_ref<void(ValueRange, ValueRange, ValueRange, 131 ValueRange, const MemRefBoundsCapture &)> 132 loopBodyBuilder); 133 134 /// Common state to lower vector transfer ops. 135 PatternRewriter &rewriter; 136 const VectorTransferToSCFOptions &options; 137 Location loc; 138 std::unique_ptr<ScopedContext> scope; 139 ConcreteOp xferOp; 140 Operation *op; 141 // A vector transfer copies data between: 142 // - memref<(leading_dims) x (major_dims) x (minor_dims) x type> 143 // - vector<(major_dims) x (minor_dims) x type> 144 unsigned minorRank; // for now always 1 145 unsigned majorRank; // vector rank - minorRank 146 unsigned leadingRank; // memref rank - vector rank 147 VectorType vectorType; // vector<(major_dims) x (minor_dims) x type> 148 VectorType majorVectorType; // vector<(major_dims) x type> 149 VectorType minorVectorType; // vector<(minor_dims) x type> 150 MemRefType memRefMinorVectorType; // memref<vector<(minor_dims) x type>> 151 }; 152 153 template <typename ConcreteOp> 154 void NDTransferOpHelper<ConcreteOp>::emitLoops( 155 llvm::function_ref<void(ValueRange, ValueRange, ValueRange, ValueRange, 156 const MemRefBoundsCapture &)> 157 loopBodyBuilder) { 158 /// Loop nest operates on the major dimensions 159 MemRefBoundsCapture memrefBoundsCapture(xferOp.source()); 160 161 if (options.unroll) { 162 auto shape = majorVectorType.getShape(); 163 auto strides = computeStrides(shape); 164 unsigned numUnrolledInstances = computeMaxLinearIndex(shape); 165 ValueRange indices(xferOp.indices()); 166 for (unsigned idx = 0; idx < numUnrolledInstances; ++idx) { 167 SmallVector<int64_t, 4> offsets = delinearize(strides, idx); 168 SmallVector<Value, 4> offsetValues = 169 llvm::to_vector<4>(llvm::map_range(offsets, [](int64_t off) -> Value { 170 return std_constant_index(off); 171 })); 172 loopBodyBuilder(offsetValues, indices.take_front(leadingRank), 173 indices.drop_front(leadingRank).take_front(majorRank), 174 indices.take_back(minorRank), memrefBoundsCapture); 175 } 176 } else { 177 VectorBoundsCapture vectorBoundsCapture(majorVectorType); 178 auto majorLbs = vectorBoundsCapture.getLbs(); 179 auto majorUbs = vectorBoundsCapture.getUbs(); 180 auto majorSteps = vectorBoundsCapture.getSteps(); 181 affineLoopNestBuilder( 182 majorLbs, majorUbs, majorSteps, [&](ValueRange majorIvs) { 183 ValueRange indices(xferOp.indices()); 184 loopBodyBuilder(majorIvs, indices.take_front(leadingRank), 185 indices.drop_front(leadingRank).take_front(majorRank), 186 indices.take_back(minorRank), memrefBoundsCapture); 187 }); 188 } 189 } 190 191 static Optional<int64_t> extractConstantIndex(Value v) { 192 if (auto cstOp = v.getDefiningOp<ConstantIndexOp>()) 193 return cstOp.getValue(); 194 if (auto affineApplyOp = v.getDefiningOp<AffineApplyOp>()) 195 if (affineApplyOp.getAffineMap().isSingleConstant()) 196 return affineApplyOp.getAffineMap().getSingleConstantResult(); 197 return None; 198 } 199 200 // Missing foldings of scf.if make it necessary to perform poor man's folding 201 // eagerly, especially in the case of unrolling. In the future, this should go 202 // away once scf.if folds properly. 203 static Value onTheFlyFoldSLT(Value v, Value ub) { 204 using namespace mlir::edsc::op; 205 auto maybeCstV = extractConstantIndex(v); 206 auto maybeCstUb = extractConstantIndex(ub); 207 if (maybeCstV && maybeCstUb && *maybeCstV < *maybeCstUb) 208 return Value(); 209 return slt(v, ub); 210 } 211 212 /// 1. Compute the indexings `majorIvs + majorOffsets` and save them in 213 /// `majorIvsPlusOffsets`. 214 /// 2. Return a value of i1 that determines whether the first 215 /// `majorIvs.rank()` 216 /// dimensions `majorIvs + majorOffsets` are all within `memrefBounds`. 217 static Value 218 emitInBoundsCondition(PatternRewriter &rewriter, 219 VectorTransferOpInterface xferOp, unsigned leadingRank, 220 ValueRange majorIvs, ValueRange majorOffsets, 221 const MemRefBoundsCapture &memrefBounds, 222 SmallVectorImpl<Value> &majorIvsPlusOffsets) { 223 Value inBoundsCondition; 224 majorIvsPlusOffsets.reserve(majorIvs.size()); 225 unsigned idx = 0; 226 SmallVector<Value, 4> bounds = 227 applyMapToValues(rewriter, xferOp.getLoc(), xferOp.permutation_map(), 228 memrefBounds.getUbs()); 229 for (auto it : llvm::zip(majorIvs, majorOffsets, bounds)) { 230 Value iv = std::get<0>(it), off = std::get<1>(it), ub = std::get<2>(it); 231 using namespace mlir::edsc::op; 232 majorIvsPlusOffsets.push_back(iv + off); 233 if (!xferOp.isDimInBounds(leadingRank + idx)) { 234 Value inBoundsCond = onTheFlyFoldSLT(majorIvsPlusOffsets.back(), ub); 235 if (inBoundsCond) 236 inBoundsCondition = (inBoundsCondition) 237 ? (inBoundsCondition && inBoundsCond) 238 : inBoundsCond; 239 } 240 ++idx; 241 } 242 return inBoundsCondition; 243 } 244 245 // TODO: Parallelism and threadlocal considerations. 246 static Value setAllocAtFunctionEntry(MemRefType memRefMinorVectorType, 247 Operation *op) { 248 auto &b = ScopedContext::getBuilderRef(); 249 OpBuilder::InsertionGuard guard(b); 250 Operation *scope = 251 op->getParentWithTrait<OpTrait::AutomaticAllocationScope>(); 252 assert(scope && "Expected op to be inside automatic allocation scope"); 253 b.setInsertionPointToStart(&scope->getRegion(0).front()); 254 Value res = memref_alloca(memRefMinorVectorType); 255 return res; 256 } 257 258 template <> 259 LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() { 260 Value alloc, result; 261 if (options.unroll) 262 result = std_splat(vectorType, xferOp.padding()); 263 else 264 alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op); 265 266 emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets, 267 ValueRange majorOffsets, ValueRange minorOffsets, 268 const MemRefBoundsCapture &memrefBounds) { 269 /// Lambda to load 1-D vector in the current loop ivs + offset context. 270 auto load1DVector = [&](ValueRange majorIvsPlusOffsets) -> Value { 271 SmallVector<Value, 8> indexing; 272 indexing.reserve(leadingRank + majorRank + minorRank); 273 indexing.append(leadingOffsets.begin(), leadingOffsets.end()); 274 indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end()); 275 indexing.append(minorOffsets.begin(), minorOffsets.end()); 276 Value memref = xferOp.source(); 277 auto map = 278 getTransferMinorIdentityMap(xferOp.getShapedType(), minorVectorType); 279 ArrayAttr inBounds; 280 if (xferOp.isDimInBounds(xferOp.getVectorType().getRank() - 1)) { 281 OpBuilder &b = ScopedContext::getBuilderRef(); 282 inBounds = b.getBoolArrayAttr({true}); 283 } 284 return vector_transfer_read(minorVectorType, memref, indexing, 285 AffineMapAttr::get(map), xferOp.padding(), 286 inBounds); 287 }; 288 289 // 1. Compute the inBoundsCondition in the current loops ivs + offset 290 // context. 291 SmallVector<Value, 4> majorIvsPlusOffsets; 292 Value inBoundsCondition = emitInBoundsCondition( 293 rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()), 294 leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets); 295 296 if (inBoundsCondition) { 297 // 2. If the condition is not null, we need an IfOp, which may yield 298 // if `options.unroll` is true. 299 SmallVector<Type, 1> resultType; 300 if (options.unroll) 301 resultType.push_back(vectorType); 302 303 // 3. If in-bounds, progressively lower to a 1-D transfer read, otherwise 304 // splat a 1-D vector. 305 ValueRange ifResults = conditionBuilder( 306 resultType, inBoundsCondition, 307 [&]() -> scf::ValueVector { 308 Value vector = load1DVector(majorIvsPlusOffsets); 309 // 3.a. If `options.unroll` is true, insert the 1-D vector in the 310 // aggregate. We must yield and merge with the `else` branch. 311 if (options.unroll) { 312 vector = vector_insert(vector, result, majorIvs); 313 return {vector}; 314 } 315 // 3.b. Otherwise, just go through the temporary `alloc`. 316 memref_store(vector, alloc, majorIvs); 317 return {}; 318 }, 319 [&]() -> scf::ValueVector { 320 Value vector = std_splat(minorVectorType, xferOp.padding()); 321 // 3.c. If `options.unroll` is true, insert the 1-D vector in the 322 // aggregate. We must yield and merge with the `then` branch. 323 if (options.unroll) { 324 vector = vector_insert(vector, result, majorIvs); 325 return {vector}; 326 } 327 // 3.d. Otherwise, just go through the temporary `alloc`. 328 memref_store(vector, alloc, majorIvs); 329 return {}; 330 }); 331 332 if (!resultType.empty()) 333 result = *ifResults.begin(); 334 } else { 335 // 4. Guaranteed in-bounds, progressively lower to a 1-D transfer read. 336 Value loaded1D = load1DVector(majorIvsPlusOffsets); 337 // 5.a. If `options.unroll` is true, insert the 1-D vector in the 338 // aggregate. 339 if (options.unroll) 340 result = vector_insert(loaded1D, result, majorIvs); 341 // 5.b. Otherwise, just go through the temporary `alloc`. 342 else 343 memref_store(loaded1D, alloc, majorIvs); 344 } 345 }); 346 347 assert((!options.unroll ^ (bool)result) && 348 "Expected resulting Value iff unroll"); 349 if (!result) 350 result = 351 memref_load(vector_type_cast(MemRefType::get({}, vectorType), alloc)); 352 rewriter.replaceOp(op, result); 353 354 return success(); 355 } 356 357 template <> 358 LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() { 359 Value alloc; 360 if (!options.unroll) { 361 alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op); 362 memref_store(xferOp.vector(), 363 vector_type_cast(MemRefType::get({}, vectorType), alloc)); 364 } 365 366 emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets, 367 ValueRange majorOffsets, ValueRange minorOffsets, 368 const MemRefBoundsCapture &memrefBounds) { 369 // Lower to 1-D vector_transfer_write and let recursion handle it. 370 auto emitTransferWrite = [&](ValueRange majorIvsPlusOffsets) { 371 SmallVector<Value, 8> indexing; 372 indexing.reserve(leadingRank + majorRank + minorRank); 373 indexing.append(leadingOffsets.begin(), leadingOffsets.end()); 374 indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end()); 375 indexing.append(minorOffsets.begin(), minorOffsets.end()); 376 Value result; 377 // If `options.unroll` is true, extract the 1-D vector from the 378 // aggregate. 379 if (options.unroll) 380 result = vector_extract(xferOp.vector(), majorIvs); 381 else 382 result = memref_load(alloc, majorIvs); 383 auto map = 384 getTransferMinorIdentityMap(xferOp.getShapedType(), minorVectorType); 385 ArrayAttr inBounds; 386 if (xferOp.isDimInBounds(xferOp.getVectorType().getRank() - 1)) { 387 OpBuilder &b = ScopedContext::getBuilderRef(); 388 inBounds = b.getBoolArrayAttr({true}); 389 } 390 vector_transfer_write(result, xferOp.source(), indexing, 391 AffineMapAttr::get(map), inBounds); 392 }; 393 394 // 1. Compute the inBoundsCondition in the current loops ivs + offset 395 // context. 396 SmallVector<Value, 4> majorIvsPlusOffsets; 397 Value inBoundsCondition = emitInBoundsCondition( 398 rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()), 399 leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets); 400 401 if (inBoundsCondition) { 402 // 2.a. If the condition is not null, we need an IfOp, to write 403 // conditionally. Progressively lower to a 1-D transfer write. 404 conditionBuilder(inBoundsCondition, 405 [&] { emitTransferWrite(majorIvsPlusOffsets); }); 406 } else { 407 // 2.b. Guaranteed in-bounds. Progressively lower to a 1-D transfer write. 408 emitTransferWrite(majorIvsPlusOffsets); 409 } 410 }); 411 412 rewriter.eraseOp(op); 413 414 return success(); 415 } 416 417 } // namespace 418 419 /// Analyzes the `transfer` to find an access dimension along the fastest remote 420 /// MemRef dimension. If such a dimension with coalescing properties is found, 421 /// `pivs` and `vectorBoundsCapture` are swapped so that the invocation of 422 /// LoopNestBuilder captures it in the innermost loop. 423 template <typename TransferOpTy> 424 static int computeCoalescedIndex(TransferOpTy transfer) { 425 // rank of the remote memory access, coalescing behavior occurs on the 426 // innermost memory dimension. 427 auto remoteRank = transfer.getShapedType().getRank(); 428 // Iterate over the results expressions of the permutation map to determine 429 // the loop order for creating pointwise copies between remote and local 430 // memories. 431 int coalescedIdx = -1; 432 auto exprs = transfer.permutation_map().getResults(); 433 for (auto en : llvm::enumerate(exprs)) { 434 auto dim = en.value().template dyn_cast<AffineDimExpr>(); 435 if (!dim) { 436 continue; 437 } 438 auto memRefDim = dim.getPosition(); 439 if (memRefDim == remoteRank - 1) { 440 // memRefDim has coalescing properties, it should be swapped in the last 441 // position. 442 assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices"); 443 coalescedIdx = en.index(); 444 } 445 } 446 return coalescedIdx; 447 } 448 449 template <typename TransferOpTy> 450 VectorTransferRewriter<TransferOpTy>::VectorTransferRewriter( 451 VectorTransferToSCFOptions options, MLIRContext *context) 452 : RewritePattern(TransferOpTy::getOperationName(), 1, context), 453 options(options) {} 454 455 /// Used for staging the transfer in a local buffer. 456 template <typename TransferOpTy> 457 MemRefType VectorTransferRewriter<TransferOpTy>::tmpMemRefType( 458 TransferOpTy transfer) const { 459 auto vectorType = transfer.getVectorType(); 460 return MemRefType::get(vectorType.getShape().drop_back(), 461 VectorType::get(vectorType.getShape().take_back(), 462 vectorType.getElementType()), 463 {}, 0); 464 } 465 466 static void emitWithBoundsChecks( 467 PatternRewriter &rewriter, VectorTransferOpInterface transfer, 468 ValueRange ivs, const MemRefBoundsCapture &memRefBoundsCapture, 469 function_ref<void(ArrayRef<Value>)> inBoundsFun, 470 function_ref<void(ArrayRef<Value>)> outOfBoundsFun = nullptr) { 471 // Permute the incoming indices according to the permutation map. 472 SmallVector<Value, 4> indices = 473 applyMapToValues(rewriter, transfer.getLoc(), transfer.permutation_map(), 474 transfer.indices()); 475 476 // Generate a bounds check if necessary. 477 SmallVector<Value, 4> majorIvsPlusOffsets; 478 Value inBoundsCondition = 479 emitInBoundsCondition(rewriter, transfer, 0, ivs, indices, 480 memRefBoundsCapture, majorIvsPlusOffsets); 481 482 // Apply the permutation map to the ivs. The permutation map may not use all 483 // the inputs. 484 SmallVector<Value, 4> scalarAccessExprs(transfer.indices().size()); 485 for (unsigned memRefDim = 0; memRefDim < transfer.indices().size(); 486 ++memRefDim) { 487 // Linear search on a small number of entries. 488 int loopIndex = -1; 489 auto exprs = transfer.permutation_map().getResults(); 490 for (auto en : llvm::enumerate(exprs)) { 491 auto expr = en.value(); 492 auto dim = expr.dyn_cast<AffineDimExpr>(); 493 // Sanity check. 494 assert((dim || expr.cast<AffineConstantExpr>().getValue() == 0) && 495 "Expected dim or 0 in permutationMap"); 496 if (dim && memRefDim == dim.getPosition()) { 497 loopIndex = en.index(); 498 break; 499 } 500 } 501 502 using namespace edsc::op; 503 auto i = transfer.indices()[memRefDim]; 504 scalarAccessExprs[memRefDim] = loopIndex < 0 ? i : i + ivs[loopIndex]; 505 } 506 507 if (inBoundsCondition) 508 conditionBuilder( 509 /* scf.if */ inBoundsCondition, // { 510 [&] { inBoundsFun(scalarAccessExprs); }, 511 // } else { 512 outOfBoundsFun ? [&] { outOfBoundsFun(scalarAccessExprs); } 513 : function_ref<void()>() 514 // } 515 ); 516 else 517 inBoundsFun(scalarAccessExprs); 518 } 519 520 namespace mlir { 521 522 /// Lowers TransferReadOp into a combination of: 523 /// 1. local memory allocation; 524 /// 2. perfect loop nest over: 525 /// a. scalar load from local buffers (viewed as a scalar memref); 526 /// a. scalar store to original memref (with padding). 527 /// 3. vector_load from local buffer (viewed as a memref<1 x vector>); 528 /// 4. local memory deallocation. 529 /// 530 /// Lowers the data transfer part of a TransferReadOp while ensuring no 531 /// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by 532 /// padding. 533 534 /// Performs the rewrite. 535 template <> 536 LogicalResult VectorTransferRewriter<TransferReadOp>::matchAndRewrite( 537 Operation *op, PatternRewriter &rewriter) const { 538 using namespace mlir::edsc::op; 539 540 TransferReadOp transfer = cast<TransferReadOp>(op); 541 if (transfer.mask()) 542 return failure(); 543 auto memRefType = transfer.getShapedType().dyn_cast<MemRefType>(); 544 if (!memRefType) 545 return failure(); 546 // Fall back to a loop if the fastest varying stride is not 1 or it is 547 // permuted. 548 int64_t offset; 549 SmallVector<int64_t, 4> strides; 550 auto successStrides = getStridesAndOffset(memRefType, strides, offset); 551 if (succeeded(successStrides) && strides.back() == 1 && 552 transfer.permutation_map().isMinorIdentity()) { 553 // If > 1D, emit a bunch of loops around 1-D vector transfers. 554 if (transfer.getVectorType().getRank() > 1) 555 return NDTransferOpHelper<TransferReadOp>(rewriter, transfer, options) 556 .doReplace(); 557 // If 1-D this is now handled by the target-specific lowering. 558 if (transfer.getVectorType().getRank() == 1) 559 return failure(); 560 } 561 562 // Conservative lowering to scalar load / stores. 563 // 1. Setup all the captures. 564 ScopedContext scope(rewriter, transfer.getLoc()); 565 MemRefIndexedValue remote(transfer.source()); 566 MemRefBoundsCapture memRefBoundsCapture(transfer.source()); 567 VectorBoundsCapture vectorBoundsCapture(transfer.vector()); 568 int coalescedIdx = computeCoalescedIndex(transfer); 569 // Swap the vectorBoundsCapture which will reorder loop bounds. 570 if (coalescedIdx >= 0) 571 vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1, 572 coalescedIdx); 573 574 auto lbs = vectorBoundsCapture.getLbs(); 575 auto ubs = vectorBoundsCapture.getUbs(); 576 SmallVector<Value, 8> steps; 577 steps.reserve(vectorBoundsCapture.getSteps().size()); 578 for (auto step : vectorBoundsCapture.getSteps()) 579 steps.push_back(std_constant_index(step)); 580 581 // 2. Emit alloc-copy-load-dealloc. 582 MLIRContext *ctx = op->getContext(); 583 Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer); 584 MemRefIndexedValue local(tmp); 585 loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) { 586 auto ivsStorage = llvm::to_vector<8>(loopIvs); 587 // Swap the ivs which will reorder memory accesses. 588 if (coalescedIdx >= 0) 589 std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]); 590 591 ArrayRef<Value> ivs(ivsStorage); 592 Value pos = std_index_cast(IntegerType::get(ctx, 32), ivs.back()); 593 Value inVector = local(ivs.drop_back()); 594 auto loadValue = [&](ArrayRef<Value> indices) { 595 Value vector = vector_insert_element(remote(indices), inVector, pos); 596 local(ivs.drop_back()) = vector; 597 }; 598 auto loadPadding = [&](ArrayRef<Value>) { 599 Value vector = vector_insert_element(transfer.padding(), inVector, pos); 600 local(ivs.drop_back()) = vector; 601 }; 602 emitWithBoundsChecks( 603 rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()), ivs, 604 memRefBoundsCapture, loadValue, loadPadding); 605 }); 606 Value vectorValue = memref_load(vector_type_cast(tmp)); 607 608 // 3. Propagate. 609 rewriter.replaceOp(op, vectorValue); 610 return success(); 611 } 612 613 /// Lowers TransferWriteOp into a combination of: 614 /// 1. local memory allocation; 615 /// 2. vector_store to local buffer (viewed as a memref<1 x vector>); 616 /// 3. perfect loop nest over: 617 /// a. scalar load from local buffers (viewed as a scalar memref); 618 /// a. scalar store to original memref (if in bounds). 619 /// 4. local memory deallocation. 620 /// 621 /// More specifically, lowers the data transfer part while ensuring no 622 /// out-of-bounds accesses are possible. 623 template <> 624 LogicalResult VectorTransferRewriter<TransferWriteOp>::matchAndRewrite( 625 Operation *op, PatternRewriter &rewriter) const { 626 using namespace edsc::op; 627 628 TransferWriteOp transfer = cast<TransferWriteOp>(op); 629 if (transfer.mask()) 630 return failure(); 631 auto memRefType = transfer.getShapedType().template dyn_cast<MemRefType>(); 632 if (!memRefType) 633 return failure(); 634 635 // Fall back to a loop if the fastest varying stride is not 1 or it is 636 // permuted. 637 int64_t offset; 638 SmallVector<int64_t, 4> strides; 639 auto successStrides = getStridesAndOffset(memRefType, strides, offset); 640 if (succeeded(successStrides) && strides.back() == 1 && 641 transfer.permutation_map().isMinorIdentity()) { 642 // If > 1D, emit a bunch of loops around 1-D vector transfers. 643 if (transfer.getVectorType().getRank() > 1) 644 return NDTransferOpHelper<TransferWriteOp>(rewriter, transfer, options) 645 .doReplace(); 646 // If 1-D this is now handled by the target-specific lowering. 647 if (transfer.getVectorType().getRank() == 1) 648 return failure(); 649 } 650 651 // 1. Setup all the captures. 652 ScopedContext scope(rewriter, transfer.getLoc()); 653 MemRefIndexedValue remote(transfer.source()); 654 MemRefBoundsCapture memRefBoundsCapture(transfer.source()); 655 Value vectorValue(transfer.vector()); 656 VectorBoundsCapture vectorBoundsCapture(transfer.vector()); 657 int coalescedIdx = computeCoalescedIndex(transfer); 658 // Swap the vectorBoundsCapture which will reorder loop bounds. 659 if (coalescedIdx >= 0) 660 vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1, 661 coalescedIdx); 662 663 auto lbs = vectorBoundsCapture.getLbs(); 664 auto ubs = vectorBoundsCapture.getUbs(); 665 SmallVector<Value, 8> steps; 666 steps.reserve(vectorBoundsCapture.getSteps().size()); 667 for (auto step : vectorBoundsCapture.getSteps()) 668 steps.push_back(std_constant_index(step)); 669 670 // 2. Emit alloc-store-copy-dealloc. 671 Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer); 672 MemRefIndexedValue local(tmp); 673 Value vec = vector_type_cast(tmp); 674 memref_store(vectorValue, vec); 675 loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) { 676 auto ivsStorage = llvm::to_vector<8>(loopIvs); 677 // Swap the ivsStorage which will reorder memory accesses. 678 if (coalescedIdx >= 0) 679 std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]); 680 681 ArrayRef<Value> ivs(ivsStorage); 682 Value pos = 683 std_index_cast(IntegerType::get(op->getContext(), 32), ivs.back()); 684 auto storeValue = [&](ArrayRef<Value> indices) { 685 Value scalar = vector_extract_element(local(ivs.drop_back()), pos); 686 remote(indices) = scalar; 687 }; 688 emitWithBoundsChecks( 689 rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()), ivs, 690 memRefBoundsCapture, storeValue); 691 }); 692 693 // 3. Erase. 694 rewriter.eraseOp(op); 695 return success(); 696 } 697 698 void populateVectorToSCFConversionPatterns( 699 RewritePatternSet &patterns, const VectorTransferToSCFOptions &options) { 700 patterns.add<VectorTransferRewriter<vector::TransferReadOp>, 701 VectorTransferRewriter<vector::TransferWriteOp>>( 702 options, patterns.getContext()); 703 } 704 705 } // namespace mlir 706 707 namespace { 708 709 struct ConvertVectorToSCFPass 710 : public ConvertVectorToSCFBase<ConvertVectorToSCFPass> { 711 ConvertVectorToSCFPass() = default; 712 ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) { 713 this->fullUnroll = options.unroll; 714 } 715 716 void runOnFunction() override { 717 RewritePatternSet patterns(getFunction().getContext()); 718 populateVectorToSCFConversionPatterns( 719 patterns, VectorTransferToSCFOptions().setUnroll(fullUnroll)); 720 (void)applyPatternsAndFoldGreedily(getFunction(), std::move(patterns)); 721 } 722 }; 723 724 } // namespace 725 726 std::unique_ptr<Pass> 727 mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) { 728 return std::make_unique<ConvertVectorToSCFPass>(options); 729 } 730