//===- VectorToSCF.cpp - Conversion from Vector to mix of SCF and Std -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements target-independent lowering of vector transfer
// operations.
//
//===----------------------------------------------------------------------===//

#include <type_traits>

#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"

#include "../PassDetail.h"
#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/SCF/EDSC/Builders.h"
#include "mlir/Dialect/SCF/EDSC/Intrinsics.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/Types.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/Passes.h"

#define ALIGNMENT_SIZE 128

using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using vector::TransferReadOp;
using vector::TransferWriteOp;

namespace {
/// Helper class to capture the common information needed to lower N>1-D vector
/// transfer operations (read and write).
/// On construction, this class opens an edsc::ScopedContext for simpler IR
/// manipulation.
/// In pseudo-IR, for an n-D vector_transfer_read such as:
///
/// ```
///    vector_transfer_read(%m, %offsets, identity_map, %fill) :
///      memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
///      vector<(major_dims) x (minor_dims) x type>
/// ```
///
/// where rank(minor_dims) is the lower-level vector rank (e.g. 1 for LLVM or
/// higher).
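///
/// For instance (purely illustrative shapes), with leading_dims = 7,
/// major_dims = 3 and minor_dims = 5, such a read would look like:
///
/// ```
///    vector_transfer_read(%m, %offsets, identity_map, %fill) :
///      memref<7x3x5xf32>, vector<3x5xf32>
/// ```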
///
/// This is the entry point to emitting pseudo-IR resembling:
///
/// ```
///    %tmp = alloc(): memref<(major_dims) x vector<minor_dim x type>>
///    for (%ivs_major, {0}, {vector_shape}, {1}) { // (N-1)-D loop nest
///      if (any_of(%ivs_major + %offsets, <, major_dims)) {
///        %v = vector_transfer_read(
///          {%offsets_leading, %ivs_major + %offsets_major, %offsets_minor},
///           %ivs_minor):
///          memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
///          vector<(minor_dims) x type>;
///        store(%v, %tmp);
///      } else {
///        %v = splat(vector<(minor_dims) x type>, %fill)
///        store(%v, %tmp, %ivs_major);
///      }
///    }
///    %res = load(%tmp, %0): memref<(major_dims) x vector<minor_dim x type>>:
///      vector<(major_dims) x (minor_dims) x type>
/// ```
///
template <typename ConcreteOp>
class NDTransferOpHelper {
public:
  NDTransferOpHelper(PatternRewriter &rewriter, ConcreteOp xferOp,
                     const VectorTransferToSCFOptions &options)
      : rewriter(rewriter), options(options), loc(xferOp.getLoc()),
        scope(std::make_unique<ScopedContext>(rewriter, loc)), xferOp(xferOp),
        op(xferOp.getOperation()) {
    vectorType = xferOp.getVectorType();
    // TODO: when we go to k > 1-D vectors adapt minorRank.
    minorRank = 1;
    majorRank = vectorType.getRank() - minorRank;
    leadingRank = xferOp.getLeadingMemRefRank();
    majorVectorType =
        VectorType::get(vectorType.getShape().take_front(majorRank),
                        vectorType.getElementType());
    minorVectorType =
        VectorType::get(vectorType.getShape().take_back(minorRank),
                        vectorType.getElementType());
    // Memref of minor vector type is used for individual transfers.
    memRefMinorVectorType =
        MemRefType::get(majorVectorType.getShape(), minorVectorType, {},
                        xferOp.getMemRefType().getMemorySpace());
  }

  LogicalResult doReplace();

private:
  /// Creates the loop nest on the "major" dimensions and calls the
  /// `loopBodyBuilder` lambda in the context of the loop nest.
  template <typename Lambda>
  void emitLoops(Lambda loopBodyBuilder);

  /// Operates within the body of `emitLoops` to:
  ///   1. Compute the indexings `majorIvs + majorOffsets` and save them in
  ///      `majorIvsPlusOffsets`.
  ///   2. Return a boolean that is true iff the first `majorIvs.rank()`
  ///      dimensions of `majorIvs + majorOffsets` are all within
  ///      `memrefBounds`.
  Value emitInBoundsCondition(ValueRange majorIvs, ValueRange majorOffsets,
                              MemRefBoundsCapture &memrefBounds,
                              SmallVectorImpl<Value> &majorIvsPlusOffsets);

  /// Common state to lower vector transfer ops.
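  /// Note: `scope` keeps the edsc::ScopedContext opened in the constructor
  /// alive for the lifetime of this helper, so that the edsc intrinsics used
  /// in the lowering implicitly build at `loc` through `rewriter`.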
  PatternRewriter &rewriter;
  const VectorTransferToSCFOptions &options;
  Location loc;
  std::unique_ptr<ScopedContext> scope;
  ConcreteOp xferOp;
  Operation *op;
  // A vector transfer copies data between:
  //   - memref<(leading_dims) x (major_dims) x (minor_dims) x type>
  //   - vector<(major_dims) x (minor_dims) x type>
  unsigned minorRank;         // for now always 1
  unsigned majorRank;         // vector rank - minorRank
  unsigned leadingRank;       // memref rank - vector rank
  VectorType vectorType;      // vector<(major_dims) x (minor_dims) x type>
  VectorType majorVectorType; // vector<(major_dims) x type>
  VectorType minorVectorType; // vector<(minor_dims) x type>
  MemRefType memRefMinorVectorType; // memref<(major_dims) x
                                    //   vector<(minor_dims) x type>>
};

template <typename ConcreteOp>
template <typename Lambda>
void NDTransferOpHelper<ConcreteOp>::emitLoops(Lambda loopBodyBuilder) {
  // The loop nest operates on the major dimensions.
  MemRefBoundsCapture memrefBoundsCapture(xferOp.memref());

  if (options.unroll) {
    auto shape = majorVectorType.getShape();
    auto strides = computeStrides(shape);
    unsigned numUnrolledInstances = computeMaxLinearIndex(shape);
    ValueRange indices(xferOp.indices());
    for (unsigned idx = 0; idx < numUnrolledInstances; ++idx) {
      SmallVector<int64_t, 4> offsets = delinearize(strides, idx);
      SmallVector<Value, 4> offsetValues = llvm::to_vector<4>(
          llvm::map_range(offsets, [](int64_t off) -> Value {
            return std_constant_index(off);
          }));
      loopBodyBuilder(offsetValues, indices.take_front(leadingRank),
                      indices.drop_front(leadingRank).take_front(majorRank),
                      indices.take_back(minorRank), memrefBoundsCapture);
    }
  } else {
    VectorBoundsCapture vectorBoundsCapture(majorVectorType);
    auto majorLbs = vectorBoundsCapture.getLbs();
    auto majorUbs = vectorBoundsCapture.getUbs();
    auto majorSteps = vectorBoundsCapture.getSteps();
    affineLoopNestBuilder(
        majorLbs, majorUbs, majorSteps, [&](ValueRange majorIvs) {
          ValueRange indices(xferOp.indices());
          loopBodyBuilder(majorIvs, indices.take_front(leadingRank),
                          indices.drop_front(leadingRank).take_front(majorRank),
                          indices.take_back(minorRank), memrefBoundsCapture);
        });
  }
}

static Optional<int64_t> extractConstantIndex(Value v) {
  if (auto cstOp = v.getDefiningOp<ConstantIndexOp>())
    return cstOp.getValue();
  if (auto affineApplyOp = v.getDefiningOp<AffineApplyOp>())
    if (affineApplyOp.getAffineMap().isSingleConstant())
      return affineApplyOp.getAffineMap().getSingleConstantResult();
  return None;
}

// Missing foldings of scf.if make it necessary to perform poor man's folding
// eagerly, especially in the case of unrolling. In the future, this should go
// away once scf.if folds properly.
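// For example, when both operands are constants and statically in bounds
// (say v = 3 and ub = 8), the comparison below folds away entirely and a null
// Value is returned; otherwise an `slt` is materialized.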
static Value onTheFlyFoldSLT(Value v, Value ub) {
  using namespace mlir::edsc::op;
  auto maybeCstV = extractConstantIndex(v);
  auto maybeCstUb = extractConstantIndex(ub);
  if (maybeCstV && maybeCstUb && *maybeCstV < *maybeCstUb)
    return Value();
  return slt(v, ub);
}

template <typename ConcreteOp>
Value NDTransferOpHelper<ConcreteOp>::emitInBoundsCondition(
    ValueRange majorIvs, ValueRange majorOffsets,
    MemRefBoundsCapture &memrefBounds,
    SmallVectorImpl<Value> &majorIvsPlusOffsets) {
  Value inBoundsCondition;
  majorIvsPlusOffsets.reserve(majorIvs.size());
  unsigned idx = 0;
  SmallVector<Value, 4> bounds =
      linalg::applyMapToValues(rewriter, xferOp.getLoc(),
                               xferOp.permutation_map(), memrefBounds.getUbs());
  for (auto it : llvm::zip(majorIvs, majorOffsets, bounds)) {
    Value iv = std::get<0>(it), off = std::get<1>(it), ub = std::get<2>(it);
    using namespace mlir::edsc::op;
    majorIvsPlusOffsets.push_back(iv + off);
    if (xferOp.isMaskedDim(leadingRank + idx)) {
      Value inBoundsCond = onTheFlyFoldSLT(majorIvsPlusOffsets.back(), ub);
      if (inBoundsCond)
        inBoundsCondition = (inBoundsCondition)
                                ? (inBoundsCondition && inBoundsCond)
                                : inBoundsCond;
    }
    ++idx;
  }
  return inBoundsCondition;
}

// TODO: Parallelism and threadlocal considerations.
static Value setAllocAtFunctionEntry(MemRefType memRefMinorVectorType,
                                     Operation *op) {
  auto &b = ScopedContext::getBuilderRef();
  OpBuilder::InsertionGuard guard(b);
  Operation *scope =
      op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
  assert(scope && "Expected op to be inside automatic allocation scope");
  b.setInsertionPointToStart(&scope->getRegion(0).front());
  Value res = std_alloca(memRefMinorVectorType, ValueRange{},
                         b.getI64IntegerAttr(ALIGNMENT_SIZE));
  return res;
}

template <>
LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() {
  Value alloc, result;
  if (options.unroll)
    result = std_splat(vectorType, xferOp.padding());
  else
    alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);

  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
                ValueRange majorOffsets, ValueRange minorOffsets,
                MemRefBoundsCapture &memrefBounds) {
    // Lambda to load a 1-D vector in the current loop ivs + offset context.
    auto load1DVector = [&](ValueRange majorIvsPlusOffsets) -> Value {
      SmallVector<Value, 8> indexing;
      indexing.reserve(leadingRank + majorRank + minorRank);
      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
      indexing.append(minorOffsets.begin(), minorOffsets.end());
      Value memref = xferOp.memref();
      auto map =
          getTransferMinorIdentityMap(xferOp.getMemRefType(), minorVectorType);
      ArrayAttr masked;
      if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) {
        OpBuilder &b = ScopedContext::getBuilderRef();
        masked = b.getBoolArrayAttr({false});
      }
      return vector_transfer_read(minorVectorType, memref, indexing,
                                  AffineMapAttr::get(map), xferOp.padding(),
                                  masked);
    };

    // 1. Compute the inBoundsCondition in the current loop ivs + offset
    // context.
    SmallVector<Value, 4> majorIvsPlusOffsets;
    Value inBoundsCondition = emitInBoundsCondition(
        majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);

    if (inBoundsCondition) {
      // 2. If the condition is not null, we need an IfOp, which may yield
      // if `options.unroll` is true.
      SmallVector<Type, 1> resultType;
      if (options.unroll)
        resultType.push_back(vectorType);

      // 3. If in-bounds, progressively lower to a 1-D transfer read, otherwise
      // splat a 1-D vector.
      ValueRange ifResults = conditionBuilder(
          resultType, inBoundsCondition,
          [&]() -> scf::ValueVector {
            Value vector = load1DVector(majorIvsPlusOffsets);
            // 3.a. If `options.unroll` is true, insert the 1-D vector in the
            // aggregate. We must yield and merge with the `else` branch.
            if (options.unroll) {
              vector = vector_insert(vector, result, majorIvs);
              return {vector};
            }
            // 3.b. Otherwise, just go through the temporary `alloc`.
            std_store(vector, alloc, majorIvs);
            return {};
          },
          [&]() -> scf::ValueVector {
            Value vector = std_splat(minorVectorType, xferOp.padding());
            // 3.c. If `options.unroll` is true, insert the 1-D vector in the
            // aggregate. We must yield and merge with the `then` branch.
            if (options.unroll) {
              vector = vector_insert(vector, result, majorIvs);
              return {vector};
            }
            // 3.d. Otherwise, just go through the temporary `alloc`.
            std_store(vector, alloc, majorIvs);
            return {};
          });

      if (!resultType.empty())
        result = *ifResults.begin();
    } else {
      // 4. Guaranteed in-bounds, progressively lower to a 1-D transfer read.
      Value loaded1D = load1DVector(majorIvsPlusOffsets);
      // 4.a. If `options.unroll` is true, insert the 1-D vector in the
      // aggregate.
      if (options.unroll)
        result = vector_insert(loaded1D, result, majorIvs);
      // 4.b. Otherwise, just go through the temporary `alloc`.
      else
        std_store(loaded1D, alloc, majorIvs);
    }
  });

  assert((!options.unroll ^ (bool)result) &&
         "Expected resulting Value iff unroll");
  if (!result)
    result = std_load(vector_type_cast(MemRefType::get({}, vectorType), alloc));
  rewriter.replaceOp(op, result);

  return success();
}

template <>
LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() {
  Value alloc;
  if (!options.unroll) {
    alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);
    std_store(xferOp.vector(),
              vector_type_cast(MemRefType::get({}, vectorType), alloc));
  }

  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
                ValueRange majorOffsets, ValueRange minorOffsets,
                MemRefBoundsCapture &memrefBounds) {
    // Lower to 1-D vector_transfer_write and let recursion handle it.
    auto emitTransferWrite = [&](ValueRange majorIvsPlusOffsets) {
      SmallVector<Value, 8> indexing;
      indexing.reserve(leadingRank + majorRank + minorRank);
      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
      indexing.append(minorOffsets.begin(), minorOffsets.end());
      Value result;
      // If `options.unroll` is true, extract the 1-D vector from the
      // aggregate.
      if (options.unroll)
        result = vector_extract(xferOp.vector(), majorIvs);
      else
        result = std_load(alloc, majorIvs);
      auto map =
          getTransferMinorIdentityMap(xferOp.getMemRefType(), minorVectorType);
      ArrayAttr masked;
      if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) {
        OpBuilder &b = ScopedContext::getBuilderRef();
        masked = b.getBoolArrayAttr({false});
      }
      vector_transfer_write(result, xferOp.memref(), indexing,
                            AffineMapAttr::get(map), masked);
    };

    // 1. Compute the inBoundsCondition in the current loop ivs + offset
    // context.
    SmallVector<Value, 4> majorIvsPlusOffsets;
    Value inBoundsCondition = emitInBoundsCondition(
        majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);

    if (inBoundsCondition) {
      // 2.a. If the condition is not null, we need an IfOp to write
      // conditionally. Progressively lower to a 1-D transfer write.
      conditionBuilder(inBoundsCondition,
                       [&] { emitTransferWrite(majorIvsPlusOffsets); });
    } else {
      // 2.b. Guaranteed in-bounds. Progressively lower to a 1-D transfer write.
      emitTransferWrite(majorIvsPlusOffsets);
    }
  });

  rewriter.eraseOp(op);

  return success();
}

} // namespace

/// Analyzes the `transfer` to find an access dimension along the fastest
/// varying remote MemRef dimension. If such a dimension with coalescing
/// properties is found, the corresponding loop bounds in `vectorBoundsCapture`
/// are swapped so that the invocation of LoopNestBuilder captures it in the
/// innermost loop.
template <typename TransferOpTy>
static int computeCoalescedIndex(TransferOpTy transfer) {
  // Rank of the remote memory access; coalescing behavior occurs on the
  // innermost memory dimension.
  auto remoteRank = transfer.getMemRefType().getRank();
  // Iterate over the results expressions of the permutation map to determine
  // the loop order for creating pointwise copies between remote and local
  // memories.
  int coalescedIdx = -1;
  auto exprs = transfer.permutation_map().getResults();
  for (auto en : llvm::enumerate(exprs)) {
    auto dim = en.value().template dyn_cast<AffineDimExpr>();
    if (!dim) {
      continue;
    }
    auto memRefDim = dim.getPosition();
    if (memRefDim == remoteRank - 1) {
      // memRefDim has coalescing properties, it should be swapped into the
      // last position.
      assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices");
      coalescedIdx = en.index();
    }
  }
  return coalescedIdx;
}

/// Emits remote memory accesses that are clipped to the boundaries of the
/// MemRef.
template <typename TransferOpTy>
static SmallVector<Value, 8>
clip(TransferOpTy transfer, MemRefBoundsCapture &bounds, ArrayRef<Value> ivs) {
  using namespace mlir::edsc;

  Value zero(std_constant_index(0)), one(std_constant_index(1));
  SmallVector<Value, 8> memRefAccess(transfer.indices());
  SmallVector<Value, 8> clippedScalarAccessExprs(memRefAccess.size());
  // Indices accessing remote memory are clipped and their expressions are
  // returned in clippedScalarAccessExprs.
  for (unsigned memRefDim = 0; memRefDim < clippedScalarAccessExprs.size();
       ++memRefDim) {
    // Linear search on a small number of entries.
    int loopIndex = -1;
    auto exprs = transfer.permutation_map().getResults();
    for (auto en : llvm::enumerate(exprs)) {
      auto expr = en.value();
      auto dim = expr.template dyn_cast<AffineDimExpr>();
      // Sanity check.
      assert(
          (dim || expr.template cast<AffineConstantExpr>().getValue() == 0) &&
          "Expected dim or 0 in permutationMap");
      if (dim && memRefDim == dim.getPosition()) {
        loopIndex = en.index();
        break;
      }
    }

    // We cannot distinguish atm between unrolled dimensions that implement
    // the "always full" tile abstraction and need clipping from the other
    // ones. So we conservatively clip everything.
    using namespace edsc::op;
    auto N = bounds.ub(memRefDim);
    auto i = memRefAccess[memRefDim];
    if (loopIndex < 0) {
      auto N_minus_1 = N - one;
      auto select_1 = std_select(slt(i, N), i, N_minus_1);
      clippedScalarAccessExprs[memRefDim] =
          std_select(slt(i, zero), zero, select_1);
    } else {
      auto ii = ivs[loopIndex];
      auto i_plus_ii = i + ii;
      auto N_minus_1 = N - one;
      auto select_1 = std_select(slt(i_plus_ii, N), i_plus_ii, N_minus_1);
      clippedScalarAccessExprs[memRefDim] =
          std_select(slt(i_plus_ii, zero), zero, select_1);
    }
  }

  return clippedScalarAccessExprs;
}

namespace mlir {

template <typename TransferOpTy>
VectorTransferRewriter<TransferOpTy>::VectorTransferRewriter(
    VectorTransferToSCFOptions options, MLIRContext *context)
    : RewritePattern(TransferOpTy::getOperationName(), 1, context),
      options(options) {}

/// Used for staging the transfer in a local buffer.
template <typename TransferOpTy>
MemRefType VectorTransferRewriter<TransferOpTy>::tmpMemRefType(
    TransferOpTy transfer) const {
  auto vectorType = transfer.getVectorType();
  return MemRefType::get(vectorType.getShape(), vectorType.getElementType(), {},
                         0);
}

/// Lowers TransferReadOp into a combination of:
///   1. local memory allocation;
///   2. perfect loop nest over:
///      a. scalar load from original memref (with clipping);
///      b. scalar store to local buffer (viewed as a scalar memref);
///   3. vector_load from local buffer (viewed as a memref<1 x vector>);
///   4. local memory deallocation.
///
/// Lowers the data transfer part of a TransferReadOp while ensuring no
/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
/// clipping. This means that a given value in memory can be read multiple
/// times and concurrently.
///
/// Important notes about clipping and "full-tiles only" abstraction:
/// =================================================================
/// When using clipping for dealing with boundary conditions, the same edge
/// value will appear multiple times (a.k.a. edge padding). This is fine if the
/// subsequent vector operations are all data-parallel but **is generally
/// incorrect** in the presence of reductions or extract operations.
///
/// More generally, clipping is a scalar abstraction that is expected to work
/// fine as a baseline for CPUs and GPUs but not for vector_load and DMAs.
/// To deal with real vector_load and DMAs, a "padded allocation + view"
/// abstraction with the ability to read out-of-memref-bounds (but still within
/// the allocated region) is necessary.
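///
/// Concretely, for each memref dimension with access index `idx` and bound
/// `N`, the `clip` helper above emits the equivalent of
/// `select(idx < 0, 0, select(idx < N, idx, N - 1))`, i.e. the access is
/// clamped into [0, N-1].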
///
/// Whether using scalar loops or vector_load/DMAs to perform the transfer,
/// junk values will be materialized in the vectors and generally need to be
/// filtered out and replaced by the "neutral element". This neutral element is
/// op-dependent so, in the future, we expect to create a vector filter and
/// apply it to a splatted constant vector with the proper neutral element at
/// each ssa-use. This filtering is not necessary for pure data-parallel
/// operations.
///
/// In the case of vector_store/DMAs, Read-Modify-Write will be required, which
/// also has concurrency implications. Note that by using clipped scalar stores
/// in the presence of data-parallel only operations, we generate code that
/// writes the same value multiple times on the edge locations.
///
/// TODO: implement alternatives to clipping.
/// TODO: support non-data-parallel operations.

/// Performs the rewrite.
template <>
LogicalResult VectorTransferRewriter<TransferReadOp>::matchAndRewrite(
    Operation *op, PatternRewriter &rewriter) const {
  using namespace mlir::edsc::op;

  TransferReadOp transfer = cast<TransferReadOp>(op);
  if (transfer.permutation_map().isMinorIdentity()) {
    // If > 1-D, emit a bunch of loops around 1-D vector transfers.
    if (transfer.getVectorType().getRank() > 1)
      return NDTransferOpHelper<TransferReadOp>(rewriter, transfer, options)
          .doReplace();
    // If 1-D, this is now handled by the target-specific lowering.
    if (transfer.getVectorType().getRank() == 1)
      return failure();
  }

  // Conservative lowering to scalar load / stores.
  // 1. Setup all the captures.
  ScopedContext scope(rewriter, transfer.getLoc());
  StdIndexedValue remote(transfer.memref());
  MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
  int coalescedIdx = computeCoalescedIndex(transfer);
  // Swap the vectorBoundsCapture which will reorder loop bounds.
  if (coalescedIdx >= 0)
    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
                                   coalescedIdx);

  auto lbs = vectorBoundsCapture.getLbs();
  auto ubs = vectorBoundsCapture.getUbs();
  SmallVector<Value, 8> steps;
  steps.reserve(vectorBoundsCapture.getSteps().size());
  for (auto step : vectorBoundsCapture.getSteps())
    steps.push_back(std_constant_index(step));

  // 2. Emit alloc-copy-load-dealloc.
  Value tmp = std_alloc(tmpMemRefType(transfer), ValueRange{},
                        rewriter.getI64IntegerAttr(ALIGNMENT_SIZE));
  StdIndexedValue local(tmp);
  Value vec = vector_type_cast(tmp);
  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
    auto ivs = llvm::to_vector<8>(loopIvs);
    // Swap the ivs which will reorder memory accesses.
    if (coalescedIdx >= 0)
      std::swap(ivs.back(), ivs[coalescedIdx]);
    // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
    local(ivs) = remote(clip(transfer, memRefBoundsCapture, ivs));
  });
  Value vectorValue = std_load(vec);
  (std_dealloc(tmp)); // vexing parse

  // 3. Propagate.
  rewriter.replaceOp(op, vectorValue);
  return success();
}

/// Lowers TransferWriteOp into a combination of:
///   1. local memory allocation;
///   2. vector_store to local buffer (viewed as a memref<1 x vector>);
///   3. perfect loop nest over:
///      a. scalar load from local buffer (viewed as a scalar memref);
///      b. scalar store to original memref (with clipping);
///   4. local memory deallocation.
///
/// More specifically, lowers the data transfer part while ensuring no
/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
/// clipping. This means that a given value in memory can be written to
/// multiple times and concurrently.
///
/// See the "Important notes about clipping and full-tiles only abstraction" in
/// the description of the TransferReadOp lowering above.
///
/// TODO: implement alternatives to clipping.
/// TODO: support non-data-parallel operations.
template <>
LogicalResult VectorTransferRewriter<TransferWriteOp>::matchAndRewrite(
    Operation *op, PatternRewriter &rewriter) const {
  using namespace edsc::op;

  TransferWriteOp transfer = cast<TransferWriteOp>(op);
  if (transfer.permutation_map().isMinorIdentity()) {
    // If > 1-D, emit a bunch of loops around 1-D vector transfers.
    if (transfer.getVectorType().getRank() > 1)
      return NDTransferOpHelper<TransferWriteOp>(rewriter, transfer, options)
          .doReplace();
    // If 1-D, this is now handled by the target-specific lowering.
    if (transfer.getVectorType().getRank() == 1)
      return failure();
  }

  // 1. Setup all the captures.
  ScopedContext scope(rewriter, transfer.getLoc());
  StdIndexedValue remote(transfer.memref());
  MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
  Value vectorValue(transfer.vector());
  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
  int coalescedIdx = computeCoalescedIndex(transfer);
  // Swap the vectorBoundsCapture which will reorder loop bounds.
  if (coalescedIdx >= 0)
    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
                                   coalescedIdx);

  auto lbs = vectorBoundsCapture.getLbs();
  auto ubs = vectorBoundsCapture.getUbs();
  SmallVector<Value, 8> steps;
  steps.reserve(vectorBoundsCapture.getSteps().size());
  for (auto step : vectorBoundsCapture.getSteps())
    steps.push_back(std_constant_index(step));

  // 2. Emit alloc-store-copy-dealloc.
  Value tmp = std_alloc(tmpMemRefType(transfer), ValueRange{},
                        rewriter.getI64IntegerAttr(ALIGNMENT_SIZE));
  StdIndexedValue local(tmp);
  Value vec = vector_type_cast(tmp);
  std_store(vectorValue, vec);
  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
    auto ivs = llvm::to_vector<8>(loopIvs);
    // Swap the ivs which will reorder memory accesses.
    if (coalescedIdx >= 0)
      std::swap(ivs.back(), ivs[coalescedIdx]);
    // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
    remote(clip(transfer, memRefBoundsCapture, ivs)) = local(ivs);
  });
  (std_dealloc(tmp)); // vexing parse...

  rewriter.eraseOp(op);
  return success();
}

void populateVectorToSCFConversionPatterns(
    OwningRewritePatternList &patterns, MLIRContext *context,
    const VectorTransferToSCFOptions &options) {
  patterns.insert<VectorTransferRewriter<vector::TransferReadOp>,
                  VectorTransferRewriter<vector::TransferWriteOp>>(options,
                                                                   context);
}

} // namespace mlir

namespace {

struct ConvertVectorToSCFPass
    : public ConvertVectorToSCFBase<ConvertVectorToSCFPass> {
  ConvertVectorToSCFPass() = default;
  ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
    this->fullUnroll = options.unroll;
  }

  void runOnFunction() override {
    OwningRewritePatternList patterns;
    auto *context = getFunction().getContext();
    populateVectorToSCFConversionPatterns(
        patterns, context, VectorTransferToSCFOptions().setUnroll(fullUnroll));
    applyPatternsAndFoldGreedily(getFunction(), patterns);
  }
};

} // namespace

std::unique_ptr<Pass>
mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
  return std::make_unique<ConvertVectorToSCFPass>(options);
}