//===- VectorToSCF.cpp - Conversion from Vector to mix of SCF and Std -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements target-dependent lowering of vector transfer operations.
//
//===----------------------------------------------------------------------===//

#include <type_traits>

#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/SCF/EDSC/Builders.h"
#include "mlir/Dialect/SCF/EDSC/Intrinsics.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/Types.h"

using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using vector::TransferReadOp;
using vector::TransferWriteOp;

/// Helper class that captures the common information needed to lower N>1-D
/// vector transfer operations (read and write).
/// On construction, this class opens an edsc::ScopedContext for simpler IR
/// manipulation.
/// In pseudo-IR, for an n-D vector_transfer_read such as:
///
/// ```
///   vector_transfer_read(%m, %offsets, identity_map, %fill) :
///     memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
///     vector<(major_dims) x (minor_dims) x type>
/// ```
///
/// where rank(minor_dims) is the lower-level vector rank (e.g. 1 for LLVM or
/// higher).
///
/// This is the entry point to emitting pseudo-IR resembling:
///
/// ```
///   %tmp = alloc(): memref<(major_dims) x vector<minor_dim x type>>
///   for (%ivs_major, {0}, {vector_shape}, {1}) { // (N-1)-D loop nest
///     if (any_of(%ivs_major + %offsets, <, major_dims)) {
///       %v = vector_transfer_read(
///         {%offsets_leading, %ivs_major + %offsets_major, %offsets_minor},
///          %ivs_minor):
///         memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
///         vector<(minor_dims) x type>;
///       store(%v, %tmp);
///     } else {
///       %v = splat(vector<(minor_dims) x type>, %fill)
///       store(%v, %tmp, %ivs_major);
///     }
///   }
///   %res = load(%tmp, %0): memref<(major_dims) x vector<minor_dim x type>>):
///     vector<(major_dims) x (minor_dims) x type>
/// ```
///
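/// For concreteness, an illustrative instantiation (shapes and SSA value names
/// are made up for exposition only) with leading_dims = (?), major_dims = (3)
/// and minor_dims = (15) lowers
///
/// ```
///   %v = vector_transfer_read(%A, {%c0, %i, %j}, identity_map, %f0) :
///     memref<?x?x?xf32>, vector<3x15xf32>
/// ```
///
/// into pseudo-IR resembling:
///
/// ```
///   %tmp = alloc() : memref<3 x vector<15xf32>>
///   for (%iv, {0}, {3}, {1}) {
///     if (%iv + %i < dim(%A, 1)) {
///       %v1 = vector_transfer_read(%A, {%c0, %iv + %i, %j}, %f0) :
///         memref<?x?x?xf32>, vector<15xf32>
///       store(%v1, %tmp, %iv);
///     } else {
///       store(splat(vector<15xf32>, %f0), %tmp, %iv);
///     }
///   }
///   %res = load(vector_type_cast(%tmp)) : vector<3x15xf32>
/// ```
///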
template <typename ConcreteOp>
class NDTransferOpHelper {
public:
  NDTransferOpHelper(PatternRewriter &rewriter, ConcreteOp xferOp)
      : rewriter(rewriter), loc(xferOp.getLoc()),
        scope(std::make_unique<ScopedContext>(rewriter, loc)), xferOp(xferOp),
        op(xferOp.getOperation()) {
    vectorType = xferOp.getVectorType();
    // TODO(ntv, ajcbik): when we go to k > 1-D vectors adapt minorRank.
    minorRank = 1;
    majorRank = vectorType.getRank() - minorRank;
    leadingRank = xferOp.getMemRefType().getRank() - (majorRank + minorRank);
    majorVectorType =
        VectorType::get(vectorType.getShape().take_front(majorRank),
                        vectorType.getElementType());
    minorVectorType =
        VectorType::get(vectorType.getShape().take_back(minorRank),
                        vectorType.getElementType());
    /// Memref of minor vector type is used for individual transfers.
    memRefMinorVectorType =
        MemRefType::get(majorVectorType.getShape(), minorVectorType, {},
                        xferOp.getMemRefType().getMemorySpace());
  }

  LogicalResult doReplace();

private:
  /// Creates the loop nest on the "major" dimensions and calls the
  /// `loopBodyBuilder` lambda in the context of the loop nest.
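  /// The lambda is invoked inside the generated loop nest roughly as follows
  /// (see the definition of `emitLoops` below; the argument grouping is
  /// illustrative):
  ///
  /// ```
  ///   loopBodyBuilder(majorIvs, leadingOffsets, majorOffsets, minorOffsets,
  ///                   memrefBoundsCapture);
  /// ```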
  template <typename Lambda>
  void emitLoops(Lambda loopBodyBuilder);

  /// Operate within the body of `emitLoops` to:
  ///   1. Compute the indexings `majorIvs + majorOffsets`.
  ///   2. Compute a boolean that determines whether the first `majorIvs.rank()`
  ///      dimensions `majorIvs + majorOffsets` are all within `memrefBounds`.
  ///   3. Create an IfOp conditioned on the boolean in step 2.
  ///   4. Call a `thenBlockBuilder` and an `elseBlockBuilder` to append
  ///      operations to the IfOp blocks as appropriate.
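  ///
  /// In pseudo-IR, the emitted structure resembles (illustrative sketch):
  ///
  /// ```
  ///   %cond = (%iv_0 + %off_0 < %ub_0) && ... && (%iv_k + %off_k < %ub_k)
  ///   scf.if %cond {
  ///     thenBlockBuilder(%ivs + %offsets)
  ///   } else {  // else block only emitted for vector_transfer_read
  ///     elseBlockBuilder(%ivs + %offsets)
  ///   }
  /// ```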
  template <typename LambdaThen, typename LambdaElse>
  void emitInBounds(ValueRange majorIvs, ValueRange majorOffsets,
                    MemRefBoundsCapture &memrefBounds,
                    LambdaThen thenBlockBuilder, LambdaElse elseBlockBuilder);

  /// Common state to lower vector transfer ops.
  PatternRewriter &rewriter;
  Location loc;
  std::unique_ptr<ScopedContext> scope;
  ConcreteOp xferOp;
  Operation *op;
  // A vector transfer copies data between:
  //   - memref<(leading_dims) x (major_dims) x (minor_dims) x type>
  //   - vector<(major_dims) x (minor_dims) x type>
  unsigned minorRank;         // for now always 1
  unsigned majorRank;         // vector rank - minorRank
  unsigned leadingRank;       // memref rank - vector rank
  VectorType vectorType;      // vector<(major_dims) x (minor_dims) x type>
  VectorType majorVectorType; // vector<(major_dims) x type>
  VectorType minorVectorType; // vector<(minor_dims) x type>
  MemRefType memRefMinorVectorType; // memref<vector<(minor_dims) x type>>
};

template <typename ConcreteOp>
template <typename Lambda>
void NDTransferOpHelper<ConcreteOp>::emitLoops(Lambda loopBodyBuilder) {
  /// Loop nest operates on the major dimensions
  MemRefBoundsCapture memrefBoundsCapture(xferOp.memref());
  VectorBoundsCapture vectorBoundsCapture(majorVectorType);
  auto majorLbs = vectorBoundsCapture.getLbs();
  auto majorUbs = vectorBoundsCapture.getUbs();
  auto majorSteps = vectorBoundsCapture.getSteps();
  SmallVector<Value, 8> majorIvs(vectorBoundsCapture.rank());
  AffineLoopNestBuilder(majorIvs, majorLbs, majorUbs, majorSteps)([&] {
    ValueRange indices(xferOp.indices());
    loopBodyBuilder(majorIvs, indices.take_front(leadingRank),
                    indices.drop_front(leadingRank).take_front(majorRank),
                    indices.take_back(minorRank), memrefBoundsCapture);
  });
}

template <typename ConcreteOp>
template <typename LambdaThen, typename LambdaElse>
void NDTransferOpHelper<ConcreteOp>::emitInBounds(
    ValueRange majorIvs, ValueRange majorOffsets,
    MemRefBoundsCapture &memrefBounds, LambdaThen thenBlockBuilder,
    LambdaElse elseBlockBuilder) {
  Value inBounds = std_constant_int(/*value=*/1, /*width=*/1);
  SmallVector<Value, 4> majorIvsPlusOffsets;
  majorIvsPlusOffsets.reserve(majorIvs.size());
  for (auto it : llvm::zip(majorIvs, majorOffsets, memrefBounds.getUbs())) {
    Value iv = std::get<0>(it), off = std::get<1>(it), ub = std::get<2>(it);
    using namespace mlir::edsc::op;
    majorIvsPlusOffsets.push_back(iv + off);
    Value inBounds2 = majorIvsPlusOffsets.back() < ub;
    inBounds = inBounds && inBounds2;
  }

  auto ifOp = ScopedContext::getBuilderRef().create<scf::IfOp>(
      ScopedContext::getLocation(), TypeRange{}, inBounds,
      /*withElseRegion=*/std::is_same<ConcreteOp, TransferReadOp>());
  BlockBuilder(&ifOp.thenRegion().front(),
               Append())([&] { thenBlockBuilder(majorIvsPlusOffsets); });
  if (std::is_same<ConcreteOp, TransferReadOp>())
    BlockBuilder(&ifOp.elseRegion().front(),
                 Append())([&] { elseBlockBuilder(majorIvsPlusOffsets); });
}

template <>
LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() {
  Value alloc = std_alloc(memRefMinorVectorType);

  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
                ValueRange majorOffsets, ValueRange minorOffsets,
                MemRefBoundsCapture &memrefBounds) {
    // If in-bounds, index into memref and lower to 1-D transfer read.
    auto thenBlockBuilder = [&](ValueRange majorIvsPlusOffsets) {
      auto map = AffineMap::getMinorIdentityMap(
          xferOp.getMemRefType().getRank(), minorRank, xferOp.getContext());
      // Lower to 1-D vector_transfer_read and let recursion handle it.
      Value memref = xferOp.memref();
      SmallVector<Value, 8> indexing;
      indexing.reserve(leadingRank + majorRank + minorRank);
      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
      indexing.append(minorOffsets.begin(), minorOffsets.end());
      auto loaded1D =
          vector_transfer_read(minorVectorType, memref, indexing,
                               AffineMapAttr::get(map), xferOp.padding());
      // Store the 1-D vector.
      std_store(loaded1D, alloc, majorIvs);
    };
    // If out-of-bounds, just store a splatted vector.
    auto elseBlockBuilder = [&](ValueRange majorIvsPlusOffsets) {
      auto vector = std_splat(minorVectorType, xferOp.padding());
      std_store(vector, alloc, majorIvs);
    };
    emitInBounds(majorIvs, majorOffsets, memrefBounds, thenBlockBuilder,
                 elseBlockBuilder);
  });

  Value loaded =
      std_load(vector_type_cast(MemRefType::get({}, vectorType), alloc));
  rewriter.replaceOp(op, loaded);

  return success();
}

template <>
LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() {
  Value alloc = std_alloc(memRefMinorVectorType);

  std_store(xferOp.vector(),
            vector_type_cast(MemRefType::get({}, vectorType), alloc));

  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
                ValueRange majorOffsets, ValueRange minorOffsets,
                MemRefBoundsCapture &memrefBounds) {
    auto thenBlockBuilder = [&](ValueRange majorIvsPlusOffsets) {
      // Lower to 1-D vector_transfer_write and let recursion handle it.
      Value loaded1D = std_load(alloc, majorIvs);
      auto map = AffineMap::getMinorIdentityMap(
          xferOp.getMemRefType().getRank(), minorRank, xferOp.getContext());
      SmallVector<Value, 8> indexing;
      indexing.reserve(leadingRank + majorRank + minorRank);
      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
      indexing.append(minorOffsets.begin(), minorOffsets.end());
      vector_transfer_write(loaded1D, xferOp.memref(), indexing,
                            AffineMapAttr::get(map));
    };
    // Don't write anything when out of bounds.
    auto elseBlockBuilder = [&](ValueRange majorIvsPlusOffsets) {};
    emitInBounds(majorIvs, majorOffsets, memrefBounds, thenBlockBuilder,
                 elseBlockBuilder);
  });

  rewriter.eraseOp(op);

  return success();
}

/// Analyzes the `transfer` to find an access dimension along the
/// fastest-varying (innermost) remote MemRef dimension. If such a dimension
/// with coalescing properties is found, its position among the permutation map
/// results is returned (and -1 otherwise), so that the caller can swap the
/// corresponding loop bounds and induction variables and let LoopNestBuilder
/// capture it in the innermost loop.
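///
/// For example (illustrative values), with a rank-3 memref and permutation map
/// `(d0, d1, d2) -> (d2, d1)`, the first result expression `d2` accesses the
/// innermost memref dimension, so the returned coalesced index is 0.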
template <typename TransferOpTy>
static int computeCoalescedIndex(TransferOpTy transfer) {
  // rank of the remote memory access, coalescing behavior occurs on the
  // innermost memory dimension.
  auto remoteRank = transfer.getMemRefType().getRank();
  // Iterate over the results expressions of the permutation map to determine
  // the loop order for creating pointwise copies between remote and local
  // memories.
  int coalescedIdx = -1;
  auto exprs = transfer.permutation_map().getResults();
  for (auto en : llvm::enumerate(exprs)) {
    auto dim = en.value().template dyn_cast<AffineDimExpr>();
    if (!dim) {
      continue;
    }
    auto memRefDim = dim.getPosition();
    if (memRefDim == remoteRank - 1) {
      // memRefDim has coalescing properties, it should be swapped in the last
      // position.
      assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices");
      coalescedIdx = en.index();
    }
  }
  return coalescedIdx;
}

/// Emits remote memory accesses that are clipped to the boundaries of the
/// MemRef.
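///
/// For a memref dimension with upper bound `N`, the clipped index built below
/// is, schematically (a sketch only; `ii` is dropped for dimensions that no
/// loop induction variable maps to):
///
/// ```mlir-dsc
///   std_select(i + ii < zero, zero, std_select(i + ii < N, i + ii, N - one))
/// ```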
template <typename TransferOpTy>
static SmallVector<Value, 8>
clip(TransferOpTy transfer, MemRefBoundsCapture &bounds, ArrayRef<Value> ivs) {
  using namespace mlir::edsc;

  Value zero(std_constant_index(0)), one(std_constant_index(1));
  SmallVector<Value, 8> memRefAccess(transfer.indices());
  SmallVector<Value, 8> clippedScalarAccessExprs(memRefAccess.size());
  // Indices accessing the remote memory are clipped and their expressions are
  // returned in clippedScalarAccessExprs.
  for (unsigned memRefDim = 0; memRefDim < clippedScalarAccessExprs.size();
       ++memRefDim) {
    // Linear search on a small number of entries.
    int loopIndex = -1;
    auto exprs = transfer.permutation_map().getResults();
    for (auto en : llvm::enumerate(exprs)) {
      auto expr = en.value();
      auto dim = expr.template dyn_cast<AffineDimExpr>();
      // Sanity check.
      assert(
          (dim || expr.template cast<AffineConstantExpr>().getValue() == 0) &&
          "Expected dim or 0 in permutationMap");
      if (dim && memRefDim == dim.getPosition()) {
        loopIndex = en.index();
        break;
      }
    }

    // We cannot distinguish atm between unrolled dimensions that implement
    // the "always full" tile abstraction and need clipping from the other
    // ones. So we conservatively clip everything.
    using namespace edsc::op;
    auto N = bounds.ub(memRefDim);
    auto i = memRefAccess[memRefDim];
    if (loopIndex < 0) {
      auto N_minus_1 = N - one;
      auto select_1 = std_select(i < N, i, N_minus_1);
      clippedScalarAccessExprs[memRefDim] =
          std_select(i < zero, zero, select_1);
    } else {
      auto ii = ivs[loopIndex];
      auto i_plus_ii = i + ii;
      auto N_minus_1 = N - one;
      auto select_1 = std_select(i_plus_ii < N, i_plus_ii, N_minus_1);
      clippedScalarAccessExprs[memRefDim] =
          std_select(i_plus_ii < zero, zero, select_1);
    }
  }

  return clippedScalarAccessExprs;
}

namespace {

/// Implements lowering of TransferReadOp and TransferWriteOp to a
/// proper abstraction for the hardware.
///
/// For now, we only emit a simple loop nest that performs clipped pointwise
/// copies from a remote to a locally allocated memory.
///
/// Consider the case:
///
/// ```mlir
///    // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into
///    // vector<32x256xf32> and pad with %f0 to handle the boundary case:
///    %f0 = constant 0.0f : f32
///    scf.for %i0 = 0 to %0 {
///      scf.for %i1 = 0 to %1 step %c256 {
///        scf.for %i2 = 0 to %2 step %c32 {
///          %v = vector.transfer_read %A[%i0, %i1, %i2], %f0
///               {permutation_map: (d0, d1, d2) -> (d2, d1)} :
///               memref<?x?x?xf32>, vector<32x256xf32>
///    }}}
/// ```
///
/// The rewriters construct loops and indices that access MemRef A in a pattern
/// resembling the following (while guaranteeing an always full-tile
/// abstraction):
///
/// ```mlir
///    scf.for %d2 = 0 to %c256 {
///      scf.for %d1 = 0 to %c32 {
///        %s = %A[%i0, %i1 + %d1, %i2 + %d2] : f32
///        %tmp[%d2, %d1] = %s
///      }
///    }
/// ```
///
/// In the current state, only a clipping transfer is implemented by `clip`,
/// which creates individual indexing expressions of the form:
///
/// ```mlir-dsc
///    auto condMax = i + ii < N;
///    auto max = std_select(condMax, i + ii, N - one)
///    auto cond = i + ii < zero;
///    std_select(cond, zero, max);
/// ```
///
/// In the future, clipping should not be the only way and instead we should
/// load vectors + mask them. Similarly on the write side, load/mask/store for
/// implementing RMW behavior.
///
/// Lowers TransferOp into a combination of:
///   1. local memory allocation;
///   2. perfect loop nest over:
///      a. scalar load/stores from local buffers (viewed as a scalar memref);
///      b. scalar store/load to original memref (with clipping).
///   3. vector_load/store
///   4. local memory deallocation.
/// Minor variations occur depending on whether a TransferReadOp or
/// a TransferWriteOp is rewritten.
template <typename TransferOpTy>
struct VectorTransferRewriter : public RewritePattern {
  explicit VectorTransferRewriter(MLIRContext *context)
      : RewritePattern(TransferOpTy::getOperationName(), 1, context) {}

  /// Used for staging the transfer in a local scalar buffer.
  MemRefType tmpMemRefType(TransferOpTy transfer) const {
    auto vectorType = transfer.getVectorType();
    return MemRefType::get(vectorType.getShape(), vectorType.getElementType(),
                           {}, 0);
  }

  /// Performs the rewrite.
  LogicalResult matchAndRewrite(Operation *op,
                                PatternRewriter &rewriter) const override;
};

/// Lowers TransferReadOp into a combination of:
///   1. local memory allocation;
///   2. perfect loop nest over:
///      a. scalar load from original memref (with clipping);
///      b. scalar store to local buffer (viewed as a scalar memref).
///   3. vector_load from local buffer (viewed as a memref<1 x vector>);
///   4. local memory deallocation.
///
/// Lowers the data transfer part of a TransferReadOp while ensuring no
/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
/// clipping. This means that a given value in memory can be read multiple
/// times and concurrently.
///
/// Important notes about clipping and "full-tiles only" abstraction:
/// =================================================================
/// When using clipping for dealing with boundary conditions, the same edge
/// value will appear multiple times (a.k.a edge padding). This is fine if the
/// subsequent vector operations are all data-parallel but **is generally
/// incorrect** in the presence of reductions or extract operations.
///
/// More generally, clipping is a scalar abstraction that is expected to work
/// fine as a baseline for CPUs and GPUs but not for vector_load and DMAs.
/// To deal with real vector_load and DMAs, a "padded allocation + view"
/// abstraction with the ability to read out-of-memref-bounds (but still within
/// the allocated region) is necessary.
///
/// Whether using scalar loops or vector_load/DMAs to perform the transfer,
/// junk values will be materialized in the vectors and generally need to be
/// filtered out and replaced by the "neutral element". This neutral element is
/// op-dependent so, in the future, we expect to create a vector filter and
/// apply it to a splatted constant vector with the proper neutral element at
/// each ssa-use. This filtering is not necessary for pure data-parallel
/// operations.
///
/// In the case of vector_store/DMAs, Read-Modify-Write will be required, which
/// also has concurrency implications. Note that by using clipped scalar stores
/// in the presence of data-parallel only operations, we generate code that
/// writes the same value multiple times on the edge locations.
///
/// TODO(ntv): implement alternatives to clipping.
/// TODO(ntv): support non-data-parallel operations.

/// Performs the rewrite.
template <>
LogicalResult VectorTransferRewriter<TransferReadOp>::matchAndRewrite(
    Operation *op, PatternRewriter &rewriter) const {
  using namespace mlir::edsc::op;

  TransferReadOp transfer = cast<TransferReadOp>(op);
  if (AffineMap::isMinorIdentity(transfer.permutation_map())) {
    // If > 1D, emit a bunch of loops around 1-D vector transfers.
    if (transfer.getVectorType().getRank() > 1)
      return NDTransferOpHelper<TransferReadOp>(rewriter, transfer).doReplace();
    // If 1-D this is now handled by the target-specific lowering.
    if (transfer.getVectorType().getRank() == 1)
      return failure();
  }

  // Conservative lowering to scalar load / stores.
  // 1. Setup all the captures.
  ScopedContext scope(rewriter, transfer.getLoc());
  StdIndexedValue remote(transfer.memref());
  MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
  int coalescedIdx = computeCoalescedIndex(transfer);
  // Swap the vectorBoundsCapture which will reorder loop bounds.
  if (coalescedIdx >= 0)
    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
                                   coalescedIdx);

  auto lbs = vectorBoundsCapture.getLbs();
  auto ubs = vectorBoundsCapture.getUbs();
  SmallVector<Value, 8> steps;
  steps.reserve(vectorBoundsCapture.getSteps().size());
  for (auto step : vectorBoundsCapture.getSteps())
    steps.push_back(std_constant_index(step));

  // 2. Emit alloc-copy-load-dealloc.
  Value tmp = std_alloc(tmpMemRefType(transfer));
  StdIndexedValue local(tmp);
  Value vec = vector_type_cast(tmp);
  SmallVector<Value, 8> ivs(lbs.size());
  LoopNestBuilder(ivs, lbs, ubs, steps)([&] {
    // Swap the ivs which will reorder memory accesses.
    if (coalescedIdx >= 0)
      std::swap(ivs.back(), ivs[coalescedIdx]);
    // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
    local(ivs) = remote(clip(transfer, memRefBoundsCapture, ivs));
  });
  Value vectorValue = std_load(vec);
  (std_dealloc(tmp)); // vexing parse

  // 3. Propagate.
  rewriter.replaceOp(op, vectorValue);
  return success();
}

/// Lowers TransferWriteOp into a combination of:
///   1. local memory allocation;
///   2. vector_store to local buffer (viewed as a memref<1 x vector>);
///   3. perfect loop nest over:
///      a. scalar load from local buffers (viewed as a scalar memref);
///      b. scalar store to original memref (with clipping).
///   4. local memory deallocation.
///
/// More specifically, lowers the data transfer part while ensuring no
/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
/// clipping. This means that a given value in memory can be written to multiple
/// times and concurrently.
///
/// See `Important notes about clipping and full-tiles only abstraction` in the
/// description of the TransferReadOp lowering above.
///
/// TODO(ntv): implement alternatives to clipping.
/// TODO(ntv): support non-data-parallel operations.
template <>
LogicalResult VectorTransferRewriter<TransferWriteOp>::matchAndRewrite(
    Operation *op, PatternRewriter &rewriter) const {
  using namespace edsc::op;

  TransferWriteOp transfer = cast<TransferWriteOp>(op);
  if (AffineMap::isMinorIdentity(transfer.permutation_map())) {
    // If > 1D, emit a bunch of loops around 1-D vector transfers.
    if (transfer.getVectorType().getRank() > 1)
      return NDTransferOpHelper<TransferWriteOp>(rewriter, transfer)
          .doReplace();
    // If 1-D this is now handled by the target-specific lowering.
    if (transfer.getVectorType().getRank() == 1)
      return failure();
  }

  // 1. Setup all the captures.
  ScopedContext scope(rewriter, transfer.getLoc());
  StdIndexedValue remote(transfer.memref());
  MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
  Value vectorValue(transfer.vector());
  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
  int coalescedIdx = computeCoalescedIndex(transfer);
  // Swap the vectorBoundsCapture which will reorder loop bounds.
  if (coalescedIdx >= 0)
    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
                                   coalescedIdx);

  auto lbs = vectorBoundsCapture.getLbs();
  auto ubs = vectorBoundsCapture.getUbs();
  SmallVector<Value, 8> steps;
  steps.reserve(vectorBoundsCapture.getSteps().size());
  for (auto step : vectorBoundsCapture.getSteps())
    steps.push_back(std_constant_index(step));

  // 2. Emit alloc-store-copy-dealloc.
  Value tmp = std_alloc(tmpMemRefType(transfer));
  StdIndexedValue local(tmp);
  Value vec = vector_type_cast(tmp);
  std_store(vectorValue, vec);
  SmallVector<Value, 8> ivs(lbs.size());
  LoopNestBuilder(ivs, lbs, ubs, steps)([&] {
    // Swap the ivs which will reorder memory accesses.
    if (coalescedIdx >= 0)
      std::swap(ivs.back(), ivs[coalescedIdx]);
    // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
    remote(clip(transfer, memRefBoundsCapture, ivs)) = local(ivs);
  });
  (std_dealloc(tmp)); // vexing parse...

  rewriter.eraseOp(op);
  return success();
}

} // namespace

void mlir::populateVectorToSCFConversionPatterns(
    OwningRewritePatternList &patterns, MLIRContext *context) {
  patterns.insert<VectorTransferRewriter<vector::TransferReadOp>,
                  VectorTransferRewriter<vector::TransferWriteOp>>(context);
}
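
// A minimal usage sketch (assumptions: these patterns are applied with the
// greedy pattern-rewrite driver `applyPatternsAndFoldGreedily`, and `func` is
// the function being converted; exact driver entry points may differ):
//
//   OwningRewritePatternList patterns;
//   populateVectorToSCFConversionPatterns(patterns, func.getContext());
//   applyPatternsAndFoldGreedily(func, patterns);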