//===- VectorToSCF.cpp - Conversion from Vector to mix of SCF and Std -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements target-dependent lowering of vector transfer operations.
//
//===----------------------------------------------------------------------===//

#include <type_traits>

#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"

#include "../PassDetail.h"
#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/SCF/EDSC/Builders.h"
#include "mlir/Dialect/SCF/EDSC/Intrinsics.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/Types.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/Passes.h"

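// Alignment (in bytes) used for the temporary buffers that stage vector
// transfers in local memory.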
#define ALIGNMENT_SIZE 128

using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using vector::TransferReadOp;
using vector::TransferWriteOp;

namespace {
/// Helper class that captures the common information needed to lower N>1-D
/// vector transfer operations (read and write).
/// On construction, this class opens an edsc::ScopedContext for simpler IR
/// manipulation.
/// In pseudo-IR, for an n-D vector_transfer_read such as:
///
/// ```
///   vector_transfer_read(%m, %offsets, identity_map, %fill) :
///     memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
///     vector<(major_dims) x (minor_dims) x type>
/// ```
///
/// where rank(minor_dims) is the lower-level vector rank (e.g. 1 for LLVM or
/// higher).
///
/// This is the entry point to emitting pseudo-IR resembling:
///
/// ```
///   %tmp = alloc(): memref<(major_dims) x vector<minor_dim x type>>
///   for (%ivs_major, {0}, {vector_shape}, {1}) { // (N-1)-D loop nest
///     if (any_of(%ivs_major + %offsets, <, major_dims)) {
///       %v = vector_transfer_read(
///         {%offsets_leading, %ivs_major + %offsets_major, %offsets_minor},
///          %ivs_minor):
///         memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
///         vector<(minor_dims) x type>;
///       store(%v, %tmp);
///     } else {
///       %v = splat(vector<(minor_dims) x type>, %fill)
///       store(%v, %tmp, %ivs_major);
///     }
///   }
///   %res = load(%tmp): memref<(major_dims) x vector<minor_dim x type>>:
///      vector<(major_dims) x (minor_dims) x type>
/// ```
///
template <typename ConcreteOp>
class NDTransferOpHelper {
public:
  NDTransferOpHelper(PatternRewriter &rewriter, ConcreteOp xferOp,
                     const VectorTransferToSCFOptions &options)
      : rewriter(rewriter), options(options), loc(xferOp.getLoc()),
        scope(std::make_unique<ScopedContext>(rewriter, loc)), xferOp(xferOp),
        op(xferOp.getOperation()) {
    vectorType = xferOp.getVectorType();
    // TODO: when we go to k > 1-D vectors adapt minorRank.
    minorRank = 1;
    majorRank = vectorType.getRank() - minorRank;
    leadingRank = xferOp.getLeadingMemRefRank();
    majorVectorType =
        VectorType::get(vectorType.getShape().take_front(majorRank),
                        vectorType.getElementType());
    minorVectorType =
        VectorType::get(vectorType.getShape().take_back(minorRank),
                        vectorType.getElementType());
    /// Memref of minor vector type is used for individual transfers.
    memRefMinorVectorType =
        MemRefType::get(majorVectorType.getShape(), minorVectorType, {},
                        xferOp.getMemRefType().getMemorySpace());
  }

  LogicalResult doReplace();

private:
  /// Creates the loop nest on the "major" dimensions and calls the
  /// `loopBodyBuilder` lambda in the context of the loop nest.
  template <typename Lambda>
  void emitLoops(Lambda loopBodyBuilder);

  /// Operate within the body of `emitLoops` to:
  ///   1. Compute the indexings `majorIvs + majorOffsets` and save them in
  ///      `majorIvsPlusOffsets`.
  ///   2. Return a boolean that determines whether the first `majorIvs.rank()`
  ///      dimensions of `majorIvs + majorOffsets` are all within
  ///      `memrefBounds`.
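  ///
  /// For a 2-D major loop nest, the returned condition is roughly of the form
  /// `(iv0 + off0 < ub0) && (iv1 + off1 < ub1)`, where comparisons that can be
  /// proven statically in-bounds are folded away (see `onTheFlyFoldSLT`).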
  Value emitInBoundsCondition(ValueRange majorIvs, ValueRange majorOffsets,
                              MemRefBoundsCapture &memrefBounds,
                              SmallVectorImpl<Value> &majorIvsPlusOffsets);

  /// Common state to lower vector transfer ops.
  PatternRewriter &rewriter;
  const VectorTransferToSCFOptions &options;
  Location loc;
  std::unique_ptr<ScopedContext> scope;
  ConcreteOp xferOp;
  Operation *op;
  // A vector transfer copies data between:
  //   - memref<(leading_dims) x (major_dims) x (minor_dims) x type>
  //   - vector<(major_dims) x (minor_dims) x type>
  unsigned minorRank;         // for now always 1
  unsigned majorRank;         // vector rank - minorRank
  unsigned leadingRank;       // memref rank - vector rank
  VectorType vectorType;      // vector<(major_dims) x (minor_dims) x type>
  VectorType majorVectorType; // vector<(major_dims) x type>
  VectorType minorVectorType; // vector<(minor_dims) x type>
  MemRefType memRefMinorVectorType; // memref<vector<(minor_dims) x type>>
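  // For example, transferring a vector<4x8xf32> to/from a memref<?x4x8xf32>
  // gives leadingRank = 1, majorRank = 1, minorRank = 1, with
  // majorVectorType = vector<4xf32>, minorVectorType = vector<8xf32> and
  // memRefMinorVectorType = memref<4xvector<8xf32>>.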
};

template <typename ConcreteOp>
template <typename Lambda>
void NDTransferOpHelper<ConcreteOp>::emitLoops(Lambda loopBodyBuilder) {
  /// Loop nest operates on the major dimensions
  MemRefBoundsCapture memrefBoundsCapture(xferOp.memref());

  if (options.unroll) {
    auto shape = majorVectorType.getShape();
    auto strides = computeStrides(shape);
    unsigned numUnrolledInstances = computeMaxLinearIndex(shape);
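    // For example, with majorVectorType = vector<2x3xf32>: strides = {3, 1},
    // numUnrolledInstances = 6, and the delinearized offsets enumerate
    // (0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2).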
    ValueRange indices(xferOp.indices());
    for (unsigned idx = 0; idx < numUnrolledInstances; ++idx) {
      SmallVector<int64_t, 4> offsets = delinearize(strides, idx);
      SmallVector<Value, 4> offsetValues =
          llvm::to_vector<4>(llvm::map_range(offsets, [](int64_t off) -> Value {
            return std_constant_index(off);
          }));
      loopBodyBuilder(offsetValues, indices.take_front(leadingRank),
                      indices.drop_front(leadingRank).take_front(majorRank),
                      indices.take_back(minorRank), memrefBoundsCapture);
    }
  } else {
    VectorBoundsCapture vectorBoundsCapture(majorVectorType);
    auto majorLbs = vectorBoundsCapture.getLbs();
    auto majorUbs = vectorBoundsCapture.getUbs();
    auto majorSteps = vectorBoundsCapture.getSteps();
    affineLoopNestBuilder(
        majorLbs, majorUbs, majorSteps, [&](ValueRange majorIvs) {
          ValueRange indices(xferOp.indices());
          loopBodyBuilder(majorIvs, indices.take_front(leadingRank),
                          indices.drop_front(leadingRank).take_front(majorRank),
                          indices.take_back(minorRank), memrefBoundsCapture);
        });
  }
}

static Optional<int64_t> extractConstantIndex(Value v) {
  if (auto cstOp = v.getDefiningOp<ConstantIndexOp>())
    return cstOp.getValue();
  if (auto affineApplyOp = v.getDefiningOp<AffineApplyOp>())
    if (affineApplyOp.getAffineMap().isSingleConstant())
      return affineApplyOp.getAffineMap().getSingleConstantResult();
  return None;
}

// Missing foldings of scf.if make it necessary to perform poor man's folding
// eagerly, especially in the case of unrolling. In the future, this should go
// away once scf.if folds properly.
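// For example, if both `v` and `ub` fold to constants with `v < ub` (say 2 and
// 4), no comparison is emitted and the dimension is treated as statically
// in-bounds; otherwise an `slt` comparison is materialized.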
static Value onTheFlyFoldSLT(Value v, Value ub) {
  using namespace mlir::edsc::op;
  auto maybeCstV = extractConstantIndex(v);
  auto maybeCstUb = extractConstantIndex(ub);
  if (maybeCstV && maybeCstUb && *maybeCstV < *maybeCstUb)
    return Value();
  return slt(v, ub);
}

template <typename ConcreteOp>
Value NDTransferOpHelper<ConcreteOp>::emitInBoundsCondition(
    ValueRange majorIvs, ValueRange majorOffsets,
    MemRefBoundsCapture &memrefBounds,
    SmallVectorImpl<Value> &majorIvsPlusOffsets) {
  Value inBoundsCondition;
  majorIvsPlusOffsets.reserve(majorIvs.size());
  unsigned idx = 0;
  SmallVector<Value, 4> bounds =
      linalg::applyMapToValues(rewriter, xferOp.getLoc(),
                               xferOp.permutation_map(), memrefBounds.getUbs());
  for (auto it : llvm::zip(majorIvs, majorOffsets, bounds)) {
    Value iv = std::get<0>(it), off = std::get<1>(it), ub = std::get<2>(it);
    using namespace mlir::edsc::op;
    majorIvsPlusOffsets.push_back(iv + off);
    if (xferOp.isMaskedDim(leadingRank + idx)) {
      Value inBoundsCond = onTheFlyFoldSLT(majorIvsPlusOffsets.back(), ub);
      if (inBoundsCond)
        inBoundsCondition = (inBoundsCondition)
                                ? (inBoundsCondition && inBoundsCond)
                                : inBoundsCond;
    }
    ++idx;
  }
  return inBoundsCondition;
}

// TODO: Parallelism and threadlocal considerations.
static Value setAllocAtFunctionEntry(MemRefType memRefMinorVectorType,
                                     Operation *op) {
  auto &b = ScopedContext::getBuilderRef();
  OpBuilder::InsertionGuard guard(b);
  Operation *scope =
      op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
  assert(scope && "Expected op to be inside automatic allocation scope");
  b.setInsertionPointToStart(&scope->getRegion(0).front());
  Value res = std_alloca(memRefMinorVectorType, ValueRange{},
                         b.getI64IntegerAttr(ALIGNMENT_SIZE));
  return res;
}

template <>
LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() {
  Value alloc, result;
  if (options.unroll)
    result = std_splat(vectorType, xferOp.padding());
  else
    alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);

  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
                ValueRange majorOffsets, ValueRange minorOffsets,
                MemRefBoundsCapture &memrefBounds) {
    /// Lambda to load a 1-D vector in the current loop ivs + offset context.
    auto load1DVector = [&](ValueRange majorIvsPlusOffsets) -> Value {
      SmallVector<Value, 8> indexing;
      indexing.reserve(leadingRank + majorRank + minorRank);
      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
      indexing.append(minorOffsets.begin(), minorOffsets.end());
      Value memref = xferOp.memref();
      auto map =
          getTransferMinorIdentityMap(xferOp.getMemRefType(), minorVectorType);
      ArrayAttr masked;
      if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) {
        OpBuilder &b = ScopedContext::getBuilderRef();
        masked = b.getBoolArrayAttr({false});
      }
      return vector_transfer_read(minorVectorType, memref, indexing,
                                  AffineMapAttr::get(map), xferOp.padding(),
                                  masked);
    };

    // 1. Compute the inBoundsCondition in the current loop ivs + offset
    // context.
    SmallVector<Value, 4> majorIvsPlusOffsets;
    Value inBoundsCondition = emitInBoundsCondition(
        majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);

    if (inBoundsCondition) {
      // 2. If the condition is not null, we need an IfOp, which may yield
      // if `options.unroll` is true.
      SmallVector<Type, 1> resultType;
      if (options.unroll)
        resultType.push_back(vectorType);

      // 3. If in-bounds, progressively lower to a 1-D transfer read, otherwise
      // splat a 1-D vector.
      ValueRange ifResults = conditionBuilder(
          resultType, inBoundsCondition,
          [&]() -> scf::ValueVector {
            Value vector = load1DVector(majorIvsPlusOffsets);
            // 3.a. If `options.unroll` is true, insert the 1-D vector in the
            // aggregate. We must yield and merge with the `else` branch.
            if (options.unroll) {
              vector = vector_insert(vector, result, majorIvs);
              return {vector};
            }
            // 3.b. Otherwise, just go through the temporary `alloc`.
            std_store(vector, alloc, majorIvs);
            return {};
          },
          [&]() -> scf::ValueVector {
            Value vector = std_splat(minorVectorType, xferOp.padding());
            // 3.c. If `options.unroll` is true, insert the 1-D vector in the
            // aggregate. We must yield and merge with the `then` branch.
            if (options.unroll) {
              vector = vector_insert(vector, result, majorIvs);
              return {vector};
            }
            // 3.d. Otherwise, just go through the temporary `alloc`.
            std_store(vector, alloc, majorIvs);
            return {};
          });

      if (!resultType.empty())
        result = *ifResults.begin();
    } else {
      // 4. Guaranteed in-bounds, progressively lower to a 1-D transfer read.
      Value loaded1D = load1DVector(majorIvsPlusOffsets);
      // 5.a. If `options.unroll` is true, insert the 1-D vector in the
      // aggregate.
      if (options.unroll)
        result = vector_insert(loaded1D, result, majorIvs);
      // 5.b. Otherwise, just go through the temporary `alloc`.
      else
        std_store(loaded1D, alloc, majorIvs);
    }
  });

  assert((!options.unroll ^ (bool)result) &&
         "Expected resulting Value iff unroll");
  if (!result)
    result = std_load(vector_type_cast(MemRefType::get({}, vectorType), alloc));
  rewriter.replaceOp(op, result);

  return success();
}

template <>
LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() {
  Value alloc;
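  // In the non-unrolled case, stage the full n-D vector through a temporary
  // buffer: store it via a view of the buffer as a 0-d memref of the n-D
  // vector type, then reload 1-D slices from it inside the loop nest below.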
  if (!options.unroll) {
    alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);
    std_store(xferOp.vector(),
              vector_type_cast(MemRefType::get({}, vectorType), alloc));
  }

  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
                ValueRange majorOffsets, ValueRange minorOffsets,
                MemRefBoundsCapture &memrefBounds) {
    // Lower to 1-D vector_transfer_write and let recursion handle it.
    auto emitTransferWrite = [&](ValueRange majorIvsPlusOffsets) {
      SmallVector<Value, 8> indexing;
      indexing.reserve(leadingRank + majorRank + minorRank);
      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
      indexing.append(minorOffsets.begin(), minorOffsets.end());
      Value result;
      // If `options.unroll` is true, extract the 1-D vector from the
      // aggregate.
      if (options.unroll)
        result = vector_extract(xferOp.vector(), majorIvs);
      else
        result = std_load(alloc, majorIvs);
      auto map =
          getTransferMinorIdentityMap(xferOp.getMemRefType(), minorVectorType);
      ArrayAttr masked;
      if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) {
        OpBuilder &b = ScopedContext::getBuilderRef();
        masked = b.getBoolArrayAttr({false});
      }
      vector_transfer_write(result, xferOp.memref(), indexing,
                            AffineMapAttr::get(map), masked);
    };

    // 1. Compute the inBoundsCondition in the current loop ivs + offset
    // context.
    SmallVector<Value, 4> majorIvsPlusOffsets;
    Value inBoundsCondition = emitInBoundsCondition(
        majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);

    if (inBoundsCondition) {
      // 2.a. If the condition is not null, we need an IfOp to write
      // conditionally. Progressively lower to a 1-D transfer write.
      conditionBuilder(inBoundsCondition,
                       [&] { emitTransferWrite(majorIvsPlusOffsets); });
    } else {
      // 2.b. Guaranteed in-bounds. Progressively lower to a 1-D transfer write.
      emitTransferWrite(majorIvsPlusOffsets);
    }
  });

  rewriter.eraseOp(op);

  return success();
}

} // namespace

/// Analyzes the `transfer` to find an access dimension along the fastest remote
/// MemRef dimension. If such a dimension with coalescing properties is found,
/// its index in the permutation map is returned so that the caller can swap the
/// corresponding loop bounds and induction variables and capture it in the
/// innermost loop.
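///
/// For example, with a rank-3 memref and permutation map
/// `(d0, d1, d2) -> (d2, d1)`, result 0 (`d2`) accesses the innermost memref
/// dimension, so 0 is returned; -1 is returned when no result accesses the
/// innermost dimension.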
template <typename TransferOpTy>
static int computeCoalescedIndex(TransferOpTy transfer) {
  // Rank of the remote memory access; coalescing behavior occurs on the
  // innermost memory dimension.
  auto remoteRank = transfer.getMemRefType().getRank();
  // Iterate over the result expressions of the permutation map to determine
  // the loop order for creating pointwise copies between remote and local
  // memories.
  int coalescedIdx = -1;
  auto exprs = transfer.permutation_map().getResults();
  for (auto en : llvm::enumerate(exprs)) {
    auto dim = en.value().template dyn_cast<AffineDimExpr>();
    if (!dim) {
      continue;
    }
    auto memRefDim = dim.getPosition();
    if (memRefDim == remoteRank - 1) {
      // memRefDim has coalescing properties; it should be swapped into the
      // last position.
      assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices");
      coalescedIdx = en.index();
    }
  }
  return coalescedIdx;
}

/// Emits the access expressions for the remote memory accesses, clipped to the
/// boundaries of the MemRef.
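///
/// Each access index is clamped to the valid range [0, N - 1] along its memref
/// dimension, roughly as:
///   select(i < 0, 0, select(i < N, i, N - 1))
/// where `i` is the (possibly iv-shifted) access index and `N` the memref
/// bound along that dimension.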
template <typename TransferOpTy>
static SmallVector<Value, 8>
clip(TransferOpTy transfer, MemRefBoundsCapture &bounds, ArrayRef<Value> ivs) {
  using namespace mlir::edsc;

  Value zero(std_constant_index(0)), one(std_constant_index(1));
  SmallVector<Value, 8> memRefAccess(transfer.indices());
  SmallVector<Value, 8> clippedScalarAccessExprs(memRefAccess.size());
  // Indices accessing remote memory are clipped and their expressions are
  // returned in clippedScalarAccessExprs.
  for (unsigned memRefDim = 0; memRefDim < clippedScalarAccessExprs.size();
       ++memRefDim) {
    // Linear search on a small number of entries.
    int loopIndex = -1;
    auto exprs = transfer.permutation_map().getResults();
    for (auto en : llvm::enumerate(exprs)) {
      auto expr = en.value();
      auto dim = expr.template dyn_cast<AffineDimExpr>();
      // Sanity check.
      assert(
          (dim || expr.template cast<AffineConstantExpr>().getValue() == 0) &&
          "Expected dim or 0 in permutationMap");
      if (dim && memRefDim == dim.getPosition()) {
        loopIndex = en.index();
        break;
      }
    }

    // We cannot distinguish, at the moment, between unrolled dimensions that
    // implement the "always full" tile abstraction and need clipping from the
    // other ones. So we conservatively clip everything.
    using namespace edsc::op;
    auto N = bounds.ub(memRefDim);
    auto i = memRefAccess[memRefDim];
    if (loopIndex < 0) {
      auto N_minus_1 = N - one;
      auto select_1 = std_select(slt(i, N), i, N_minus_1);
      clippedScalarAccessExprs[memRefDim] =
          std_select(slt(i, zero), zero, select_1);
    } else {
      auto ii = ivs[loopIndex];
      auto i_plus_ii = i + ii;
      auto N_minus_1 = N - one;
      auto select_1 = std_select(slt(i_plus_ii, N), i_plus_ii, N_minus_1);
      clippedScalarAccessExprs[memRefDim] =
          std_select(slt(i_plus_ii, zero), zero, select_1);
    }
  }

  return clippedScalarAccessExprs;
}

namespace mlir {

template <typename TransferOpTy>
VectorTransferRewriter<TransferOpTy>::VectorTransferRewriter(
    VectorTransferToSCFOptions options, MLIRContext *context)
    : RewritePattern(TransferOpTy::getOperationName(), 1, context),
      options(options) {}

/// Used for staging the transfer in a local buffer.
template <typename TransferOpTy>
MemRefType VectorTransferRewriter<TransferOpTy>::tmpMemRefType(
    TransferOpTy transfer) const {
  auto vectorType = transfer.getVectorType();
  return MemRefType::get(vectorType.getShape(), vectorType.getElementType(), {},
                         0);
}

/// Lowers TransferReadOp into a combination of:
///   1. local memory allocation;
///   2. perfect loop nest over:
///      a. scalar load from original memref (with clipping);
///      b. scalar store to local buffer (viewed as a scalar memref).
///   3. vector_load from local buffer (viewed as a memref<1 x vector>);
///   4. local memory deallocation.
///
/// Lowers the data transfer part of a TransferReadOp while ensuring no
/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
/// clipping. This means that a given value in memory can be read multiple
/// times and concurrently.
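///
/// In pseudo-IR, and eliding the coalescing-related loop interchange, the
/// emitted code resembles:
///
/// ```
///   %tmp = alloc() : memref<(vector_shape) x type>
///   %view = vector_type_cast %tmp : memref<vector<(vector_shape) x type>>
///   for %ivs in (vector_shape) {             // perfect scalar loop nest
///     %tmp[%ivs] = %m[clip(%offsets + %ivs)] // clipped scalar copy
///   }
///   %res = load %view : memref<vector<(vector_shape) x type>>
///   dealloc %tmp
/// ```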
///
/// Important notes about clipping and "full-tiles only" abstraction:
/// =================================================================
/// When using clipping for dealing with boundary conditions, the same edge
/// value will appear multiple times (a.k.a. edge padding). This is fine if the
/// subsequent vector operations are all data-parallel but **is generally
/// incorrect** in the presence of reductions or extract operations.
///
/// More generally, clipping is a scalar abstraction that is expected to work
/// fine as a baseline for CPUs and GPUs but not for vector_load and DMAs.
/// To deal with real vector_load and DMAs, a "padded allocation + view"
/// abstraction with the ability to read out-of-memref-bounds (but still within
/// the allocated region) is necessary.
///
/// Whether using scalar loops or vector_load/DMAs to perform the transfer,
/// junk values will be materialized in the vectors and generally need to be
/// filtered out and replaced by the "neutral element". This neutral element is
/// op-dependent so, in the future, we expect to create a vector filter and
/// apply it to a splatted constant vector with the proper neutral element at
/// each ssa-use. This filtering is not necessary for pure data-parallel
/// operations.
///
/// In the case of vector_store/DMAs, Read-Modify-Write will be required, which
/// also has concurrency implications. Note that by using clipped scalar stores
/// in the presence of data-parallel only operations, we generate code that
/// writes the same value multiple times on the edge locations.
///
/// TODO: implement alternatives to clipping.
/// TODO: support non-data-parallel operations.

/// Performs the rewrite.
template <>
LogicalResult VectorTransferRewriter<TransferReadOp>::matchAndRewrite(
    Operation *op, PatternRewriter &rewriter) const {
  using namespace mlir::edsc::op;

  TransferReadOp transfer = cast<TransferReadOp>(op);
  if (transfer.permutation_map().isMinorIdentity()) {
    // If > 1-D, emit a bunch of loops around 1-D vector transfers.
    if (transfer.getVectorType().getRank() > 1)
      return NDTransferOpHelper<TransferReadOp>(rewriter, transfer, options)
          .doReplace();
    // If 1-D, this is now handled by the target-specific lowering.
    if (transfer.getVectorType().getRank() == 1)
      return failure();
  }

  // Conservative lowering to scalar loads / stores.
  // 1. Setup all the captures.
  ScopedContext scope(rewriter, transfer.getLoc());
  StdIndexedValue remote(transfer.memref());
  MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
  int coalescedIdx = computeCoalescedIndex(transfer);
  // Swap the vectorBoundsCapture which will reorder loop bounds.
  if (coalescedIdx >= 0)
    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
                                   coalescedIdx);

  auto lbs = vectorBoundsCapture.getLbs();
  auto ubs = vectorBoundsCapture.getUbs();
  SmallVector<Value, 8> steps;
  steps.reserve(vectorBoundsCapture.getSteps().size());
  for (auto step : vectorBoundsCapture.getSteps())
    steps.push_back(std_constant_index(step));

  // 2. Emit alloc-copy-load-dealloc.
  Value tmp = std_alloc(tmpMemRefType(transfer), ValueRange{},
                        rewriter.getI64IntegerAttr(ALIGNMENT_SIZE));
  StdIndexedValue local(tmp);
  Value vec = vector_type_cast(tmp);
  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
    auto ivs = llvm::to_vector<8>(loopIvs);
    // Swap the ivs which will reorder memory accesses.
    if (coalescedIdx >= 0)
      std::swap(ivs.back(), ivs[coalescedIdx]);
    // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
    local(ivs) = remote(clip(transfer, memRefBoundsCapture, ivs));
  });
  Value vectorValue = std_load(vec);
  (std_dealloc(tmp)); // vexing parse

  // 3. Propagate.
  rewriter.replaceOp(op, vectorValue);
  return success();
}

/// Lowers TransferWriteOp into a combination of:
///   1. local memory allocation;
///   2. vector_store to local buffer (viewed as a memref<1 x vector>);
///   3. perfect loop nest over:
///      a. scalar load from local buffer (viewed as a scalar memref);
///      b. scalar store to original memref (with clipping).
///   4. local memory deallocation.
///
/// More specifically, lowers the data transfer part while ensuring no
/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
/// clipping. This means that a given value in memory can be written to multiple
/// times and concurrently.
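///
/// In pseudo-IR, the emitted code resembles:
///
/// ```
///   %tmp = alloc() : memref<(vector_shape) x type>
///   %view = vector_type_cast %tmp : memref<vector<(vector_shape) x type>>
///   store %vec, %view : memref<vector<(vector_shape) x type>>
///   for %ivs in (vector_shape) {             // perfect scalar loop nest
///     %m[clip(%offsets + %ivs)] = %tmp[%ivs] // clipped scalar copy
///   }
///   dealloc %tmp
/// ```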
///
/// See the `Important notes about clipping and "full-tiles only" abstraction`
/// section in the TransferReadOp lowering above.
///
/// TODO: implement alternatives to clipping.
/// TODO: support non-data-parallel operations.
template <>
LogicalResult VectorTransferRewriter<TransferWriteOp>::matchAndRewrite(
    Operation *op, PatternRewriter &rewriter) const {
  using namespace edsc::op;

  TransferWriteOp transfer = cast<TransferWriteOp>(op);
  if (transfer.permutation_map().isMinorIdentity()) {
    // If > 1-D, emit a bunch of loops around 1-D vector transfers.
    if (transfer.getVectorType().getRank() > 1)
      return NDTransferOpHelper<TransferWriteOp>(rewriter, transfer, options)
          .doReplace();
    // If 1-D, this is now handled by the target-specific lowering.
    if (transfer.getVectorType().getRank() == 1)
      return failure();
  }

  // 1. Setup all the captures.
  ScopedContext scope(rewriter, transfer.getLoc());
  StdIndexedValue remote(transfer.memref());
  MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
  Value vectorValue(transfer.vector());
  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
  int coalescedIdx = computeCoalescedIndex(transfer);
  // Swap the vectorBoundsCapture which will reorder loop bounds.
  if (coalescedIdx >= 0)
    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
                                   coalescedIdx);

  auto lbs = vectorBoundsCapture.getLbs();
  auto ubs = vectorBoundsCapture.getUbs();
  SmallVector<Value, 8> steps;
  steps.reserve(vectorBoundsCapture.getSteps().size());
  for (auto step : vectorBoundsCapture.getSteps())
    steps.push_back(std_constant_index(step));

  // 2. Emit alloc-store-copy-dealloc.
  Value tmp = std_alloc(tmpMemRefType(transfer), ValueRange{},
                        rewriter.getI64IntegerAttr(ALIGNMENT_SIZE));
  StdIndexedValue local(tmp);
  Value vec = vector_type_cast(tmp);
  std_store(vectorValue, vec);
  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
    auto ivs = llvm::to_vector<8>(loopIvs);
    // Swap the ivs which will reorder memory accesses.
    if (coalescedIdx >= 0)
      std::swap(ivs.back(), ivs[coalescedIdx]);
    // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
    remote(clip(transfer, memRefBoundsCapture, ivs)) = local(ivs);
  });
  (std_dealloc(tmp)); // vexing parse...

  rewriter.eraseOp(op);
  return success();
}

void populateVectorToSCFConversionPatterns(
    OwningRewritePatternList &patterns, MLIRContext *context,
    const VectorTransferToSCFOptions &options) {
  patterns.insert<VectorTransferRewriter<vector::TransferReadOp>,
                  VectorTransferRewriter<vector::TransferWriteOp>>(options,
                                                                   context);
}

} // namespace mlir

namespace {

struct ConvertVectorToSCFPass
    : public ConvertVectorToSCFBase<ConvertVectorToSCFPass> {
  ConvertVectorToSCFPass() = default;
  ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
    this->fullUnroll = options.unroll;
  }

  void runOnFunction() override {
    OwningRewritePatternList patterns;
    auto *context = getFunction().getContext();
    populateVectorToSCFConversionPatterns(
        patterns, context, VectorTransferToSCFOptions().setUnroll(fullUnroll));
    applyPatternsAndFoldGreedily(getFunction(), patterns);
  }
};

} // namespace

std::unique_ptr<Pass>
mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
  return std::make_unique<ConvertVectorToSCFPass>(options);
}