1 //===- VectorToSCF.cpp - Conversion from Vector to mix of SCF and Std -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements target-dependent lowering of vector transfer operations.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <type_traits>
14 
15 #include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
16 
17 #include "../PassDetail.h"
18 #include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
19 #include "mlir/Dialect/Linalg/Utils/Utils.h"
20 #include "mlir/Dialect/SCF/EDSC/Builders.h"
21 #include "mlir/Dialect/SCF/EDSC/Intrinsics.h"
22 #include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
23 #include "mlir/Dialect/Vector/EDSC/Intrinsics.h"
24 #include "mlir/Dialect/Vector/VectorOps.h"
25 #include "mlir/Dialect/Vector/VectorUtils.h"
26 #include "mlir/IR/AffineExpr.h"
27 #include "mlir/IR/AffineMap.h"
28 #include "mlir/IR/Attributes.h"
29 #include "mlir/IR/Builders.h"
30 #include "mlir/IR/Location.h"
31 #include "mlir/IR/Matchers.h"
32 #include "mlir/IR/OperationSupport.h"
33 #include "mlir/IR/PatternMatch.h"
34 #include "mlir/IR/Types.h"
35 #include "mlir/Pass/Pass.h"
36 #include "mlir/Transforms/Passes.h"
37 
38 using namespace mlir;
39 using namespace mlir::edsc;
40 using namespace mlir::edsc::intrinsics;
41 using vector::TransferReadOp;
42 using vector::TransferWriteOp;
43 
44 namespace {
/// Helper class that captures the common information needed to lower N>1-D
/// vector transfer operations (read and write).
47 /// On construction, this class opens an edsc::ScopedContext for simpler IR
48 /// manipulation.
49 /// In pseudo-IR, for an n-D vector_transfer_read such as:
50 ///
51 /// ```
52 ///   vector_transfer_read(%m, %offsets, identity_map, %fill) :
53 ///     memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
54 ///     vector<(major_dims) x (minor_dims) x type>
55 /// ```
56 ///
/// where rank(minor_dims) is the lower-level vector rank (e.g. 1 for LLVM, or
/// higher for other targets).
59 ///
60 /// This is the entry point to emitting pseudo-IR resembling:
61 ///
62 /// ```
63 ///   %tmp = alloc(): memref<(major_dims) x vector<minor_dim x type>>
64 ///   for (%ivs_major, {0}, {vector_shape}, {1}) { // (N-1)-D loop nest
///     if (all_of(%ivs_major + %offsets, <, major_dims)) {
66 ///       %v = vector_transfer_read(
67 ///         {%offsets_leading, %ivs_major + %offsets_major, %offsets_minor},
68 ///          %ivs_minor):
69 ///         memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
70 ///         vector<(minor_dims) x type>;
71 ///       store(%v, %tmp);
72 ///     } else {
73 ///       %v = splat(vector<(minor_dims) x type>, %fill)
74 ///       store(%v, %tmp, %ivs_major);
75 ///     }
76 ///   }
///   %res = load(vector_type_cast(%tmp)):
///     vector<(major_dims) x (minor_dims) x type>
79 /// ```
80 ///
81 template <typename ConcreteOp>
82 class NDTransferOpHelper {
83 public:
84   NDTransferOpHelper(PatternRewriter &rewriter, ConcreteOp xferOp,
85                      const VectorTransferToSCFOptions &options)
86       : rewriter(rewriter), options(options), loc(xferOp.getLoc()),
87         scope(std::make_unique<ScopedContext>(rewriter, loc)), xferOp(xferOp),
88         op(xferOp.getOperation()) {
89     vectorType = xferOp.getVectorType();
    // TODO: adapt minorRank when we go to k > 1-D minor vectors.
91     minorRank = 1;
92     majorRank = vectorType.getRank() - minorRank;
93     leadingRank = xferOp.getLeadingMemRefRank();
94     majorVectorType =
95         VectorType::get(vectorType.getShape().take_front(majorRank),
96                         vectorType.getElementType());
97     minorVectorType =
98         VectorType::get(vectorType.getShape().take_back(minorRank),
99                         vectorType.getElementType());
    // A memref of minor vector type is used for individual 1-D transfers.
101     memRefMinorVectorType =
102         MemRefType::get(majorVectorType.getShape(), minorVectorType, {},
103                         xferOp.getMemRefType().getMemorySpace());
104   }
105 
106   LogicalResult doReplace();
107 
108 private:
109   /// Creates the loop nest on the "major" dimensions and calls the
110   /// `loopBodyBuilder` lambda in the context of the loop nest.
111   template <typename Lambda>
112   void emitLoops(Lambda loopBodyBuilder);
113 
114   /// Operate within the body of `emitLoops` to:
115   ///   1. Compute the indexings `majorIvs + majorOffsets` and save them in
116   ///      `majorIvsPlusOffsets`.
  ///   2. Return the in-bounds condition, i.e. whether the first
  ///      `majorIvs.rank()` dimensions of `majorIvs + majorOffsets` are all
  ///      within `memrefBounds`. A null Value is returned when the access is
  ///      statically known to be in-bounds.
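  /// For example, for two masked major dimensions with upper bounds %ub0 and
  /// %ub1, the returned condition is (schematically)
  /// `(%iv0 + %off0 < %ub0) && (%iv1 + %off1 < %ub1)`, with comparisons that
  /// are statically known to hold folded away.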
119   Value emitInBoundsCondition(ValueRange majorIvs, ValueRange majorOffsets,
120                               MemRefBoundsCapture &memrefBounds,
121                               SmallVectorImpl<Value> &majorIvsPlusOffsets);
122 
123   /// Common state to lower vector transfer ops.
124   PatternRewriter &rewriter;
125   const VectorTransferToSCFOptions &options;
126   Location loc;
127   std::unique_ptr<ScopedContext> scope;
128   ConcreteOp xferOp;
129   Operation *op;
130   // A vector transfer copies data between:
131   //   - memref<(leading_dims) x (major_dims) x (minor_dims) x type>
132   //   - vector<(major_dims) x (minor_dims) x type>
133   unsigned minorRank;         // for now always 1
134   unsigned majorRank;         // vector rank - minorRank
135   unsigned leadingRank;       // memref rank - vector rank
136   VectorType vectorType;      // vector<(major_dims) x (minor_dims) x type>
137   VectorType majorVectorType; // vector<(major_dims) x type>
138   VectorType minorVectorType; // vector<(minor_dims) x type>
139   MemRefType memRefMinorVectorType; // memref<vector<(minor_dims) x type>>
140 };
141 
142 template <typename ConcreteOp>
143 template <typename Lambda>
144 void NDTransferOpHelper<ConcreteOp>::emitLoops(Lambda loopBodyBuilder) {
  // The loop nest operates on the major dimensions.
146   MemRefBoundsCapture memrefBoundsCapture(xferOp.memref());
147 
148   if (options.unroll) {
149     auto shape = majorVectorType.getShape();
150     auto strides = computeStrides(shape);
151     unsigned numUnrolledInstances = computeMaxLinearIndex(shape);
152     ValueRange indices(xferOp.indices());
153     for (unsigned idx = 0; idx < numUnrolledInstances; ++idx) {
154       SmallVector<int64_t, 4> offsets = delinearize(strides, idx);
155       SmallVector<Value, 4> offsetValues =
156           llvm::to_vector<4>(llvm::map_range(offsets, [](int64_t off) -> Value {
157             return std_constant_index(off);
158           }));
159       loopBodyBuilder(offsetValues, indices.take_front(leadingRank),
160                       indices.drop_front(leadingRank).take_front(majorRank),
161                       indices.take_back(minorRank), memrefBoundsCapture);
162     }
163   } else {
164     VectorBoundsCapture vectorBoundsCapture(majorVectorType);
165     auto majorLbs = vectorBoundsCapture.getLbs();
166     auto majorUbs = vectorBoundsCapture.getUbs();
167     auto majorSteps = vectorBoundsCapture.getSteps();
168     affineLoopNestBuilder(
169         majorLbs, majorUbs, majorSteps, [&](ValueRange majorIvs) {
170           ValueRange indices(xferOp.indices());
171           loopBodyBuilder(majorIvs, indices.take_front(leadingRank),
172                           indices.drop_front(leadingRank).take_front(majorRank),
173                           indices.take_back(minorRank), memrefBoundsCapture);
174         });
175   }
176 }
177 
178 static Optional<int64_t> extractConstantIndex(Value v) {
179   if (auto cstOp = v.getDefiningOp<ConstantIndexOp>())
180     return cstOp.getValue();
181   if (auto affineApplyOp = v.getDefiningOp<AffineApplyOp>())
182     if (affineApplyOp.getAffineMap().isSingleConstant())
183       return affineApplyOp.getAffineMap().getSingleConstantResult();
184   return None;
185 }
186 
// Because scf.if does not currently fold, we perform a poor man's folding
// eagerly here, which matters especially in the case of unrolling. This should
// go away once scf.if folds properly.
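// For example, if both `v` and `ub` fold to constants with `v` < `ub` (say 2
// and 4), the comparison is statically known to hold and a null Value is
// returned, letting callers elide the enclosing scf.if altogether.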
190 static Value onTheFlyFoldSLT(Value v, Value ub) {
191   using namespace mlir::edsc::op;
192   auto maybeCstV = extractConstantIndex(v);
193   auto maybeCstUb = extractConstantIndex(ub);
194   if (maybeCstV && maybeCstUb && *maybeCstV < *maybeCstUb)
195     return Value();
196   return slt(v, ub);
197 }
198 
199 template <typename ConcreteOp>
200 Value NDTransferOpHelper<ConcreteOp>::emitInBoundsCondition(
201     ValueRange majorIvs, ValueRange majorOffsets,
202     MemRefBoundsCapture &memrefBounds,
203     SmallVectorImpl<Value> &majorIvsPlusOffsets) {
204   Value inBoundsCondition;
205   majorIvsPlusOffsets.reserve(majorIvs.size());
206   unsigned idx = 0;
207   SmallVector<Value, 4> bounds =
208       linalg::applyMapToValues(rewriter, xferOp.getLoc(),
209                                xferOp.permutation_map(), memrefBounds.getUbs());
210   for (auto it : llvm::zip(majorIvs, majorOffsets, bounds)) {
211     Value iv = std::get<0>(it), off = std::get<1>(it), ub = std::get<2>(it);
212     using namespace mlir::edsc::op;
213     majorIvsPlusOffsets.push_back(iv + off);
214     if (xferOp.isMaskedDim(leadingRank + idx)) {
215       Value inBoundsCond = onTheFlyFoldSLT(majorIvsPlusOffsets.back(), ub);
216       if (inBoundsCond)
217         inBoundsCondition = (inBoundsCondition)
218                                 ? (inBoundsCondition && inBoundsCond)
219                                 : inBoundsCond;
220     }
221     ++idx;
222   }
223   return inBoundsCondition;
224 }
225 
// TODO: Parallelism and thread-local considerations.
227 static Value setAllocAtFunctionEntry(MemRefType memRefMinorVectorType,
228                                      Operation *op) {
229   auto &b = ScopedContext::getBuilderRef();
230   OpBuilder::InsertionGuard guard(b);
231   Operation *scope =
232       op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
233   assert(scope && "Expected op to be inside automatic allocation scope");
234   b.setInsertionPointToStart(&scope->getRegion(0).front());
235   Value res =
236       std_alloca(memRefMinorVectorType, ValueRange{}, b.getI64IntegerAttr(128));
237   return res;
238 }
239 
240 template <>
241 LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() {
242   Value alloc, result;
243   if (options.unroll)
244     result = std_splat(vectorType, xferOp.padding());
245   else
246     alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);
247 
248   emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
249                 ValueRange majorOffsets, ValueRange minorOffsets,
250                 MemRefBoundsCapture &memrefBounds) {
    // Lambda to load a 1-D vector in the current ivs + offsets context.
252     auto load1DVector = [&](ValueRange majorIvsPlusOffsets) -> Value {
253       SmallVector<Value, 8> indexing;
254       indexing.reserve(leadingRank + majorRank + minorRank);
255       indexing.append(leadingOffsets.begin(), leadingOffsets.end());
256       indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
257       indexing.append(minorOffsets.begin(), minorOffsets.end());
258       Value memref = xferOp.memref();
259       auto map =
260           getTransferMinorIdentityMap(xferOp.getMemRefType(), minorVectorType);
261       ArrayAttr masked;
262       if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) {
263         OpBuilder &b = ScopedContext::getBuilderRef();
264         masked = b.getBoolArrayAttr({false});
265       }
266       return vector_transfer_read(minorVectorType, memref, indexing,
267                                   AffineMapAttr::get(map), xferOp.padding(),
268                                   masked);
269     };
270 
    // 1. Compute the inBoundsCondition in the current loop ivs + offsets
    // context.
273     SmallVector<Value, 4> majorIvsPlusOffsets;
274     Value inBoundsCondition = emitInBoundsCondition(
275         majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);
276 
277     if (inBoundsCondition) {
278       // 2. If the condition is not null, we need an IfOp, which may yield
279       // if `options.unroll` is true.
280       SmallVector<Type, 1> resultType;
281       if (options.unroll)
282         resultType.push_back(vectorType);
283 
284       // 3. If in-bounds, progressively lower to a 1-D transfer read, otherwise
285       // splat a 1-D vector.
286       ValueRange ifResults = conditionBuilder(
287           resultType, inBoundsCondition,
288           [&]() -> scf::ValueVector {
289             Value vector = load1DVector(majorIvsPlusOffsets);
290             // 3.a. If `options.unroll` is true, insert the 1-D vector in the
291             // aggregate. We must yield and merge with the `else` branch.
292             if (options.unroll) {
293               vector = vector_insert(vector, result, majorIvs);
294               return {vector};
295             }
296             // 3.b. Otherwise, just go through the temporary `alloc`.
297             std_store(vector, alloc, majorIvs);
298             return {};
299           },
300           [&]() -> scf::ValueVector {
301             Value vector = std_splat(minorVectorType, xferOp.padding());
302             // 3.c. If `options.unroll` is true, insert the 1-D vector in the
303             // aggregate. We must yield and merge with the `then` branch.
304             if (options.unroll) {
305               vector = vector_insert(vector, result, majorIvs);
306               return {vector};
307             }
308             // 3.d. Otherwise, just go through the temporary `alloc`.
309             std_store(vector, alloc, majorIvs);
310             return {};
311           });
312 
313       if (!resultType.empty())
314         result = *ifResults.begin();
315     } else {
      // 4. Guaranteed in-bounds: progressively lower to a 1-D transfer read.
317       Value loaded1D = load1DVector(majorIvsPlusOffsets);
318       // 5.a. If `options.unroll` is true, insert the 1-D vector in the
319       // aggregate.
320       if (options.unroll)
321         result = vector_insert(loaded1D, result, majorIvs);
322       // 5.b. Otherwise, just go through the temporary `alloc`.
323       else
324         std_store(loaded1D, alloc, majorIvs);
325     }
326   });
327 
328   assert((!options.unroll ^ (bool)result) &&
329          "Expected resulting Value iff unroll");
330   if (!result)
331     result = std_load(vector_type_cast(MemRefType::get({}, vectorType), alloc));
332   rewriter.replaceOp(op, result);
333 
334   return success();
335 }
336 
337 template <>
338 LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() {
339   Value alloc;
340   if (!options.unroll) {
341     alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);
342     std_store(xferOp.vector(),
343               vector_type_cast(MemRefType::get({}, vectorType), alloc));
344   }
345 
346   emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
347                 ValueRange majorOffsets, ValueRange minorOffsets,
348                 MemRefBoundsCapture &memrefBounds) {
349     // Lower to 1-D vector_transfer_write and let recursion handle it.
350     auto emitTransferWrite = [&](ValueRange majorIvsPlusOffsets) {
351       SmallVector<Value, 8> indexing;
352       indexing.reserve(leadingRank + majorRank + minorRank);
353       indexing.append(leadingOffsets.begin(), leadingOffsets.end());
354       indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
355       indexing.append(minorOffsets.begin(), minorOffsets.end());
356       Value result;
357       // If `options.unroll` is true, extract the 1-D vector from the
358       // aggregate.
359       if (options.unroll)
360         result = vector_extract(xferOp.vector(), majorIvs);
361       else
362         result = std_load(alloc, majorIvs);
363       auto map =
364           getTransferMinorIdentityMap(xferOp.getMemRefType(), minorVectorType);
365       ArrayAttr masked;
366       if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) {
367         OpBuilder &b = ScopedContext::getBuilderRef();
368         masked = b.getBoolArrayAttr({false});
369       }
370       vector_transfer_write(result, xferOp.memref(), indexing,
371                             AffineMapAttr::get(map), masked);
372     };
373 
    // 1. Compute the inBoundsCondition in the current loop ivs + offsets
    // context.
376     SmallVector<Value, 4> majorIvsPlusOffsets;
377     Value inBoundsCondition = emitInBoundsCondition(
378         majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);
379 
380     if (inBoundsCondition) {
      // 2.a. If the condition is not null, we need an IfOp to write
382       // conditionally. Progressively lower to a 1-D transfer write.
383       conditionBuilder(inBoundsCondition,
384                        [&] { emitTransferWrite(majorIvsPlusOffsets); });
385     } else {
386       // 2.b. Guaranteed in-bounds. Progressively lower to a 1-D transfer write.
387       emitTransferWrite(majorIvsPlusOffsets);
388     }
389   });
390 
391   rewriter.eraseOp(op);
392 
393   return success();
394 }
395 
396 } // namespace
397 
/// Analyzes the `transfer` to find an access dimension along the fastest
/// varying remote MemRef dimension. If such a dimension with coalescing
/// properties is found, its index among the permutation map results is
/// returned (and -1 otherwise), so that callers can swap the corresponding
/// loop bounds and induction variables to iterate over it in the innermost
/// loop.
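/// For example, with a rank-3 memref and permutation map
/// `(d0, d1, d2) -> (d2, d1)`, the fastest-varying memref dimension `d2`
/// appears as result 0 of the map, so 0 is returned.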
402 template <typename TransferOpTy>
403 static int computeCoalescedIndex(TransferOpTy transfer) {
  // Rank of the remote memory access; coalescing behavior occurs on the
  // innermost memory dimension.
406   auto remoteRank = transfer.getMemRefType().getRank();
407   // Iterate over the results expressions of the permutation map to determine
408   // the loop order for creating pointwise copies between remote and local
409   // memories.
410   int coalescedIdx = -1;
411   auto exprs = transfer.permutation_map().getResults();
412   for (auto en : llvm::enumerate(exprs)) {
413     auto dim = en.value().template dyn_cast<AffineDimExpr>();
414     if (!dim) {
415       continue;
416     }
417     auto memRefDim = dim.getPosition();
418     if (memRefDim == remoteRank - 1) {
      // memRefDim has coalescing properties; it should be swapped into the
      // last position.
421       assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices");
422       coalescedIdx = en.index();
423     }
424   }
425   return coalescedIdx;
426 }
427 
/// Emits the scalar index expressions, clipped to the boundaries of the
/// MemRef, used to access remote memory.
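///
/// For each memref dimension, with `i` the transfer index, `ii` the loop iv
/// mapped to that dimension (if any) and `N` the memref bound, the clipped
/// expression is (schematically):
///
/// ```
///   select(i + ii < 0, 0, select(i + ii < N, i + ii, N - 1))
/// ```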
430 template <typename TransferOpTy>
431 static SmallVector<Value, 8>
432 clip(TransferOpTy transfer, MemRefBoundsCapture &bounds, ArrayRef<Value> ivs) {
433   using namespace mlir::edsc;
434 
435   Value zero(std_constant_index(0)), one(std_constant_index(1));
436   SmallVector<Value, 8> memRefAccess(transfer.indices());
437   SmallVector<Value, 8> clippedScalarAccessExprs(memRefAccess.size());
  // Indices accessing remote memory are clipped and their expressions are
  // returned in clippedScalarAccessExprs.
440   for (unsigned memRefDim = 0; memRefDim < clippedScalarAccessExprs.size();
441        ++memRefDim) {
442     // Linear search on a small number of entries.
443     int loopIndex = -1;
444     auto exprs = transfer.permutation_map().getResults();
445     for (auto en : llvm::enumerate(exprs)) {
446       auto expr = en.value();
447       auto dim = expr.template dyn_cast<AffineDimExpr>();
448       // Sanity check.
449       assert(
450           (dim || expr.template cast<AffineConstantExpr>().getValue() == 0) &&
451           "Expected dim or 0 in permutationMap");
452       if (dim && memRefDim == dim.getPosition()) {
453         loopIndex = en.index();
454         break;
455       }
456     }
457 
    // At the moment we cannot distinguish between unrolled dimensions that
    // implement the "always full" tile abstraction and need clipping, and the
    // other ones, so we conservatively clip everything.
461     using namespace edsc::op;
462     auto N = bounds.ub(memRefDim);
463     auto i = memRefAccess[memRefDim];
464     if (loopIndex < 0) {
465       auto N_minus_1 = N - one;
466       auto select_1 = std_select(slt(i, N), i, N_minus_1);
467       clippedScalarAccessExprs[memRefDim] =
468           std_select(slt(i, zero), zero, select_1);
469     } else {
470       auto ii = ivs[loopIndex];
471       auto i_plus_ii = i + ii;
472       auto N_minus_1 = N - one;
473       auto select_1 = std_select(slt(i_plus_ii, N), i_plus_ii, N_minus_1);
474       clippedScalarAccessExprs[memRefDim] =
475           std_select(slt(i_plus_ii, zero), zero, select_1);
476     }
477   }
478 
479   return clippedScalarAccessExprs;
480 }
481 
482 namespace mlir {
483 
484 template <typename TransferOpTy>
485 VectorTransferRewriter<TransferOpTy>::VectorTransferRewriter(
486     VectorTransferToSCFOptions options, MLIRContext *context)
487     : RewritePattern(TransferOpTy::getOperationName(), 1, context),
488       options(options) {}
489 
490 /// Used for staging the transfer in a local buffer.
491 template <typename TransferOpTy>
492 MemRefType VectorTransferRewriter<TransferOpTy>::tmpMemRefType(
493     TransferOpTy transfer) const {
494   auto vectorType = transfer.getVectorType();
495   return MemRefType::get(vectorType.getShape(), vectorType.getElementType(), {},
496                          0);
497 }
498 
499 /// Lowers TransferReadOp into a combination of:
500 ///   1. local memory allocation;
501 ///   2. perfect loop nest over:
///      a. scalar load from the original memref (with clipping);
///      b. scalar store to the local buffer (viewed as a scalar memref);
504 ///   3. vector_load from local buffer (viewed as a memref<1 x vector>);
505 ///   4. local memory deallocation.
506 ///
507 /// Lowers the data transfer part of a TransferReadOp while ensuring no
508 /// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
509 /// clipping. This means that a given value in memory can be read multiple
510 /// times and concurrently.
511 ///
512 /// Important notes about clipping and "full-tiles only" abstraction:
513 /// =================================================================
514 /// When using clipping for dealing with boundary conditions, the same edge
/// value will appear multiple times (a.k.a. edge padding). This is fine if the
516 /// subsequent vector operations are all data-parallel but **is generally
517 /// incorrect** in the presence of reductions or extract operations.
518 ///
519 /// More generally, clipping is a scalar abstraction that is expected to work
520 /// fine as a baseline for CPUs and GPUs but not for vector_load and DMAs.
521 /// To deal with real vector_load and DMAs, a "padded allocation + view"
522 /// abstraction with the ability to read out-of-memref-bounds (but still within
523 /// the allocated region) is necessary.
524 ///
525 /// Whether using scalar loops or vector_load/DMAs to perform the transfer,
526 /// junk values will be materialized in the vectors and generally need to be
527 /// filtered out and replaced by the "neutral element". This neutral element is
528 /// op-dependent so, in the future, we expect to create a vector filter and
529 /// apply it to a splatted constant vector with the proper neutral element at
530 /// each ssa-use. This filtering is not necessary for pure data-parallel
531 /// operations.
532 ///
/// In the case of vector_store/DMAs, Read-Modify-Write will be required, which
/// also has concurrency implications. Note that by using clipped scalar stores
/// in the presence of data-parallel only operations, we generate code that
/// writes the same value multiple times on the edge locations.
537 ///
538 /// TODO: implement alternatives to clipping.
539 /// TODO: support non-data-parallel operations.
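///
/// As a rough sketch (not verbatim output; types, names and constants are
/// illustrative), the conservative lowering of a 2-D read resembles:
///
/// ```
///   %tmp = alloc() : memref<3x4xf32>
///   scf.for %i = %c0 to %c3 step %c1 {
///     scf.for %j = %c0 to %c4 step %c1 {
///       %idx0, %idx1 = <clipped source indices for (%i, %j)>
///       %s = load %A[%idx0, %idx1] : memref<?x?xf32>
///       store %s, %tmp[%i, %j] : memref<3x4xf32>
///     }
///   }
///   %cast = vector.type_cast %tmp : memref<3x4xf32> to memref<vector<3x4xf32>>
///   %res = load %cast[] : memref<vector<3x4xf32>>
///   dealloc %tmp : memref<3x4xf32>
/// ```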
540 
541 /// Performs the rewrite.
542 template <>
543 LogicalResult VectorTransferRewriter<TransferReadOp>::matchAndRewrite(
544     Operation *op, PatternRewriter &rewriter) const {
545   using namespace mlir::edsc::op;
546 
547   TransferReadOp transfer = cast<TransferReadOp>(op);
548   if (transfer.permutation_map().isMinorIdentity()) {
549     // If > 1D, emit a bunch of loops around 1-D vector transfers.
550     if (transfer.getVectorType().getRank() > 1)
551       return NDTransferOpHelper<TransferReadOp>(rewriter, transfer, options)
552           .doReplace();
    // If 1-D, this is now handled by the target-specific lowering.
554     if (transfer.getVectorType().getRank() == 1)
555       return failure();
556   }
557 
558   // Conservative lowering to scalar load / stores.
559   // 1. Setup all the captures.
560   ScopedContext scope(rewriter, transfer.getLoc());
561   StdIndexedValue remote(transfer.memref());
562   MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
563   VectorBoundsCapture vectorBoundsCapture(transfer.vector());
564   int coalescedIdx = computeCoalescedIndex(transfer);
565   // Swap the vectorBoundsCapture which will reorder loop bounds.
566   if (coalescedIdx >= 0)
567     vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
568                                    coalescedIdx);
569 
570   auto lbs = vectorBoundsCapture.getLbs();
571   auto ubs = vectorBoundsCapture.getUbs();
572   SmallVector<Value, 8> steps;
573   steps.reserve(vectorBoundsCapture.getSteps().size());
574   for (auto step : vectorBoundsCapture.getSteps())
575     steps.push_back(std_constant_index(step));
576 
577   // 2. Emit alloc-copy-load-dealloc.
578   Value tmp = std_alloc(tmpMemRefType(transfer));
579   StdIndexedValue local(tmp);
580   Value vec = vector_type_cast(tmp);
581   loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
582     auto ivs = llvm::to_vector<8>(loopIvs);
583     // Swap the ivs which will reorder memory accesses.
584     if (coalescedIdx >= 0)
585       std::swap(ivs.back(), ivs[coalescedIdx]);
586     // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
587     local(ivs) = remote(clip(transfer, memRefBoundsCapture, ivs));
588   });
589   Value vectorValue = std_load(vec);
590   (std_dealloc(tmp)); // vexing parse
591 
592   // 3. Propagate.
593   rewriter.replaceOp(op, vectorValue);
594   return success();
595 }
596 
597 /// Lowers TransferWriteOp into a combination of:
598 ///   1. local memory allocation;
599 ///   2. vector_store to local buffer (viewed as a memref<1 x vector>);
600 ///   3. perfect loop nest over:
///      a. scalar load from the local buffer (viewed as a scalar memref);
///      b. scalar store to the original memref (with clipping).
603 ///   4. local memory deallocation.
604 ///
605 /// More specifically, lowers the data transfer part while ensuring no
606 /// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
607 /// clipping. This means that a given value in memory can be written to multiple
608 /// times and concurrently.
609 ///
/// See `Important notes about clipping and "full-tiles only" abstraction` in
/// the description of the TransferReadOp lowering above.
612 ///
613 /// TODO: implement alternatives to clipping.
614 /// TODO: support non-data-parallel operations.
615 template <>
616 LogicalResult VectorTransferRewriter<TransferWriteOp>::matchAndRewrite(
617     Operation *op, PatternRewriter &rewriter) const {
618   using namespace edsc::op;
619 
620   TransferWriteOp transfer = cast<TransferWriteOp>(op);
621   if (transfer.permutation_map().isMinorIdentity()) {
622     // If > 1D, emit a bunch of loops around 1-D vector transfers.
623     if (transfer.getVectorType().getRank() > 1)
624       return NDTransferOpHelper<TransferWriteOp>(rewriter, transfer, options)
625           .doReplace();
    // If 1-D, this is now handled by the target-specific lowering.
627     if (transfer.getVectorType().getRank() == 1)
628       return failure();
629   }
630 
631   // 1. Setup all the captures.
632   ScopedContext scope(rewriter, transfer.getLoc());
633   StdIndexedValue remote(transfer.memref());
634   MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
635   Value vectorValue(transfer.vector());
636   VectorBoundsCapture vectorBoundsCapture(transfer.vector());
637   int coalescedIdx = computeCoalescedIndex(transfer);
638   // Swap the vectorBoundsCapture which will reorder loop bounds.
639   if (coalescedIdx >= 0)
640     vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
641                                    coalescedIdx);
642 
643   auto lbs = vectorBoundsCapture.getLbs();
644   auto ubs = vectorBoundsCapture.getUbs();
645   SmallVector<Value, 8> steps;
646   steps.reserve(vectorBoundsCapture.getSteps().size());
647   for (auto step : vectorBoundsCapture.getSteps())
648     steps.push_back(std_constant_index(step));
649 
650   // 2. Emit alloc-store-copy-dealloc.
651   Value tmp = std_alloc(tmpMemRefType(transfer));
652   StdIndexedValue local(tmp);
653   Value vec = vector_type_cast(tmp);
654   std_store(vectorValue, vec);
655   loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
656     auto ivs = llvm::to_vector<8>(loopIvs);
657     // Swap the ivs which will reorder memory accesses.
658     if (coalescedIdx >= 0)
659       std::swap(ivs.back(), ivs[coalescedIdx]);
660     // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
661     remote(clip(transfer, memRefBoundsCapture, ivs)) = local(ivs);
662   });
663   (std_dealloc(tmp)); // vexing parse...
664 
665   rewriter.eraseOp(op);
666   return success();
667 }
668 
669 void populateVectorToSCFConversionPatterns(
670     OwningRewritePatternList &patterns, MLIRContext *context,
671     const VectorTransferToSCFOptions &options) {
672   patterns.insert<VectorTransferRewriter<vector::TransferReadOp>,
673                   VectorTransferRewriter<vector::TransferWriteOp>>(options,
674                                                                    context);
675 }
676 
677 } // namespace mlir
678 
679 namespace {
680 
681 struct ConvertVectorToSCFPass
682     : public ConvertVectorToSCFBase<ConvertVectorToSCFPass> {
683   ConvertVectorToSCFPass() = default;
684   ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
685     this->fullUnroll = options.unroll;
686   }
687 
688   void runOnFunction() override {
689     OwningRewritePatternList patterns;
690     auto *context = getFunction().getContext();
691     populateVectorToSCFConversionPatterns(
692         patterns, context, VectorTransferToSCFOptions().setUnroll(fullUnroll));
693     applyPatternsAndFoldGreedily(getFunction(), patterns);
694   }
695 };
696 
697 } // namespace
698 
699 std::unique_ptr<Pass>
700 mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
701   return std::make_unique<ConvertVectorToSCFPass>(options);
702 }
703