1 //===- VectorToSCF.cpp - Convert vector to SCF dialect ----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements lowering of vector transfer operations to SCF.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <type_traits>
14 
15 #include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
16 
17 #include "../PassDetail.h"
18 #include "mlir/Dialect/Affine/IR/AffineOps.h"
19 #include "mlir/Dialect/Affine/Utils.h"
20 #include "mlir/Dialect/SCF/SCF.h"
21 #include "mlir/Dialect/Vector/VectorOps.h"
22 #include "mlir/Dialect/Vector/VectorUtils.h"
23 #include "mlir/IR/Builders.h"
24 #include "mlir/IR/ImplicitLocOpBuilder.h"
25 #include "mlir/Pass/Pass.h"
26 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
27 #include "mlir/Transforms/Passes.h"
28 
29 using namespace mlir;
30 using vector::TransferReadOp;
31 using vector::TransferWriteOp;
32 
33 namespace {
34 
/// Attribute name used for labeling transfer ops during progressive lowering.
/// Ops carrying this label are matched by TransferOpConversion; the label is
/// re-applied (see `maybeApplyPassLabel`) only while the vector rank still
/// exceeds the target rank.
static const char kPassLabel[] = "__vector_to_scf_lowering__";
37 
38 /// Patterns that inherit from this struct have access to
39 /// VectorTransferToSCFOptions.
40 template <typename OpTy>
41 struct VectorToSCFPattern : public OpRewritePattern<OpTy> {
42   explicit VectorToSCFPattern(MLIRContext *context,
43                               VectorTransferToSCFOptions opt)
44       : OpRewritePattern<OpTy>(context), options(opt) {}
45 
46   VectorTransferToSCFOptions options;
47 };
48 
49 /// Given a vector transfer op, calculate which dimension of the `source`
50 /// memref should be unpacked in the next application of TransferOpConversion.
51 /// A return value of None indicates a broadcast.
52 template <typename OpTy>
53 static Optional<int64_t> unpackedDim(OpTy xferOp) {
54   auto map = xferOp.permutation_map();
55   if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
56     return expr.getPosition();
57   }
58   assert(xferOp.isBroadcastDim(0) &&
59          "Expected AffineDimExpr or AffineConstantExpr");
60   return None;
61 }
62 
63 /// Compute the permutation map for the new (N-1)-D vector transfer op. This
64 /// map is identical to the current permutation map, but the first result is
65 /// omitted.
66 template <typename OpTy>
67 static AffineMap unpackedPermutationMap(OpBuilder &b, OpTy xferOp) {
68   auto map = xferOp.permutation_map();
69   return AffineMap::get(map.getNumDims(), 0, map.getResults().drop_front(),
70                         b.getContext());
71 }
72 
73 /// Calculate the indices for the new vector transfer op.
74 ///
75 /// E.g.: transfer_read %A[%a, %b, %c, %d] ... : vector<5x4x3xf32> ...
76 ///       --> transfer_read %A[%a, %b + iv, %c, %d] ... vector<4x3f32>
77 ///                                 ^^^^^^
78 ///              `iv` is the iteration variable of the (new) surrounding loop.
79 template <typename OpTy>
80 static void getXferIndices(OpBuilder &b, OpTy xferOp, Value iv,
81                            SmallVector<Value, 8> &indices) {
82   typename OpTy::Adaptor adaptor(xferOp);
83   // Corresponding memref dim of the vector dim that is unpacked.
84   auto dim = unpackedDim(xferOp);
85   auto prevIndices = adaptor.indices();
86   indices.append(prevIndices.begin(), prevIndices.end());
87 
88   Location loc = xferOp.getLoc();
89   bool isBroadcast = !dim.hasValue();
90   if (!isBroadcast) {
91     AffineExpr d0, d1;
92     bindDims(xferOp.getContext(), d0, d1);
93     Value offset = adaptor.indices()[dim.getValue()];
94     indices[dim.getValue()] =
95         makeComposedAffineApply(b, loc, d0 + d1, {offset, iv});
96   }
97 }
98 
99 static void maybeYieldValue(OpBuilder &b, Location loc, bool hasRetVal,
100                             Value value) {
101   if (hasRetVal) {
102     b.create<scf::YieldOp>(loc, value);
103   } else {
104     b.create<scf::YieldOp>(loc);
105   }
106 }
107 
108 /// Generates a boolean Value that is true if the iv-th bit in xferOp's mask
109 /// is set to true. No such check is generated under following circumstances:
110 /// * xferOp does not have a mask.
111 /// * xferOp's mask is not 1D. (In case of (N>1)-D, a subvector of the mask is
112 ///   computed and attached to the new transfer op in the pattern.)
113 /// * The to-be-unpacked dim of xferOp is a broadcast.
114 template <typename OpTy>
115 static Value generateMaskCheck(OpBuilder &b, OpTy xferOp, Value iv) {
116   if (!xferOp.mask())
117     return Value();
118   if (xferOp.getMaskType().getRank() != 1)
119     return Value();
120   if (xferOp.isBroadcastDim(0))
121     return Value();
122 
123   Location loc = xferOp.getLoc();
124   Value ivI32 =
125       b.create<IndexCastOp>(loc, IntegerType::get(b.getContext(), 32), iv);
126   return b.create<vector::ExtractElementOp>(loc, xferOp.mask(), ivI32);
127 }
128 
129 /// Helper function TransferOpConversion and TransferOp1dConversion.
130 /// Generate an in-bounds check if the transfer op may go out-of-bounds on the
131 /// specified dimension `dim` with the loop iteration variable `iv`.
132 /// E.g., when unpacking dimension 0 from:
133 /// ```
134 /// %vec = vector.transfer_read %A[%a, %b] %cst
135 ///     : vector<5x4xf32>, memref<?x?xf32>
136 /// ```
137 /// An if check similar to this will be generated inside the loop:
138 /// ```
139 /// %d = memref.dim %A, %c0 : memref<?x?xf32>
140 /// if (%a + iv < %d) {
141 ///   (in-bounds case)
142 /// } else {
143 ///   (out-of-bounds case)
144 /// }
145 /// ```
146 ///
/// If the transfer is 1D and has a mask, this function generates a more
/// complex check that also accounts for potentially masked out elements.
149 ///
150 /// This function variant returns the value returned by `inBoundsCase` or
151 /// `outOfBoundsCase`. The MLIR type of the return value must be specified in
152 /// `resultTypes`.
template <typename OpTy>
static Value generateInBoundsCheck(
    OpBuilder &b, OpTy xferOp, Value iv, Optional<int64_t> dim,
    TypeRange resultTypes,
    function_ref<Value(OpBuilder &, Location)> inBoundsCase,
    function_ref<Value(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
  bool hasRetVal = !resultTypes.empty();
  Value cond; // Condition to be built...

  // Condition check 1: Access in-bounds?
  bool isBroadcast = !dim.hasValue(); // No in-bounds check for broadcasts.
  Location loc = xferOp.getLoc();
  // `lb` mirrors `b`'s insertion point with an implicit location.
  ImplicitLocOpBuilder lb(xferOp.getLoc(), b);
  if (!xferOp.isDimInBounds(0) && !isBroadcast) {
    // In bounds iff: dim extent > base index + iv.
    Value memrefDim = lb.create<memref::DimOp>(xferOp.source(), *dim);
    AffineExpr d0, d1;
    bindDims(xferOp.getContext(), d0, d1);
    Value base = xferOp.indices()[dim.getValue()];
    Value memrefIdx = makeComposedAffineApply(b, loc, d0 + d1, {base, iv});
    cond = lb.create<CmpIOp>(CmpIPredicate::sgt, memrefDim, memrefIdx);
  }

  // Condition check 2: Masked in?
  // (Only generated for 1-D masks on non-broadcast dims; see
  // `generateMaskCheck`.)
  if (auto maskCond = generateMaskCheck(b, xferOp, iv)) {
    if (cond)
      cond = lb.create<AndOp>(cond, maskCond);
    else
      cond = maskCond;
  }

  // If the condition is non-empty, generate an SCF::IfOp.
  if (cond) {
    auto check = lb.create<scf::IfOp>(
        resultTypes, cond,
        /*thenBuilder=*/
        [&](OpBuilder &b, Location loc) {
          maybeYieldValue(b, loc, hasRetVal, inBoundsCase(b, loc));
        },
        /*elseBuilder=*/
        [&](OpBuilder &b, Location loc) {
          if (outOfBoundsCase) {
            maybeYieldValue(b, loc, hasRetVal, outOfBoundsCase(b, loc));
          } else {
            // No out-of-bounds handler: yield nothing from the else branch.
            b.create<scf::YieldOp>(loc);
          }
        });

    return hasRetVal ? check.getResult(0) : Value();
  }

  // Condition is empty, no need for an SCF::IfOp.
  return inBoundsCase(b, loc);
}
206 
207 /// In this function variant, `inBoundsCase` and `outOfBoundsCase` do not have
208 /// a return value. Consequently, this function does not have a return value.
209 template <typename OpTy>
210 static void generateInBoundsCheck(
211     OpBuilder &b, OpTy xferOp, Value iv, Optional<int64_t> dim,
212     function_ref<void(OpBuilder &, Location)> inBoundsCase,
213     function_ref<void(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
214   generateInBoundsCheck(
215       b, xferOp, iv, dim, /*resultTypes=*/TypeRange(),
216       /*inBoundsCase=*/
217       [&](OpBuilder &b, Location loc) {
218         inBoundsCase(b, loc);
219         return Value();
220       },
221       /*outOfBoundsCase=*/
222       [&](OpBuilder &b, Location loc) {
223         if (outOfBoundsCase)
224           outOfBoundsCase(b, loc);
225         return Value();
226       });
227 }
228 
229 /// Given an ArrayAttr, return a copy where the first element is dropped.
230 static ArrayAttr dropFirstElem(OpBuilder &b, ArrayAttr attr) {
231   if (!attr)
232     return attr;
233   return ArrayAttr::get(b.getContext(), attr.getValue().drop_front());
234 }
235 
236 /// Add the pass label to a vector transfer op if its rank is not the target
237 /// rank.
238 template <typename OpTy>
239 static void maybeApplyPassLabel(OpBuilder &b, OpTy newXferOp,
240                                 unsigned targetRank) {
241   if (newXferOp.getVectorType().getRank() > targetRank)
242     newXferOp->setAttr(kPassLabel, b.getUnitAttr());
243 }
244 
245 namespace lowering_n_d {
246 
/// Helper data structure for data and mask buffers.
struct BufferAllocs {
  /// 0-D memref alloca wrapping the transfer op's vector (see `allocBuffers`).
  Value dataBuffer;
  /// Mask value loaded back from its buffer; null if the transfer op has no
  /// mask. The underlying allocation is recovered via `getMaskBuffer`.
  Value maskBuffer;
};
252 
/// Allocate temporary buffers for data (vector) and mask (if present).
/// Allocas are created at the start of the closest enclosing
/// AutomaticAllocationScope region.
/// TODO: Parallelism and threadlocal considerations.
template <typename OpTy>
static BufferAllocs allocBuffers(OpBuilder &b, OpTy xferOp) {
  Location loc = xferOp.getLoc();
  // Restore the caller's insertion point when this function returns.
  OpBuilder::InsertionGuard guard(b);
  Operation *scope =
      xferOp->template getParentWithTrait<OpTrait::AutomaticAllocationScope>();
  assert(scope && "Expected op to be inside automatic allocation scope");
  b.setInsertionPointToStart(&scope->getRegion(0).front());

  BufferAllocs result;
  // 0-D memref holding the whole vector value.
  auto bufferType = MemRefType::get({}, xferOp.getVectorType());
  result.dataBuffer = b.create<memref::AllocaOp>(loc, bufferType);

  if (xferOp.mask()) {
    auto maskType = MemRefType::get({}, xferOp.mask().getType());
    auto maskBuffer = b.create<memref::AllocaOp>(loc, maskType);
    // Store/load the mask right before the transfer op (not at the alloca)
    // so the stored mask value is available where the transfer op sits.
    b.setInsertionPoint(xferOp);
    b.create<memref::StoreOp>(loc, xferOp.mask(), maskBuffer);
    result.maskBuffer = b.create<memref::LoadOp>(loc, maskBuffer);
  }

  return result;
}
278 
279 /// Given a MemRefType with VectorType element type, unpack one dimension from
280 /// the VectorType into the MemRefType.
281 ///
282 /// E.g.: memref<9xvector<5x6xf32>> --> memref<9x5xvector<6xf32>>
283 static MemRefType unpackOneDim(MemRefType type) {
284   auto vectorType = type.getElementType().dyn_cast<VectorType>();
285   auto memrefShape = type.getShape();
286   SmallVector<int64_t, 8> newMemrefShape;
287   newMemrefShape.append(memrefShape.begin(), memrefShape.end());
288   newMemrefShape.push_back(vectorType.getDimSize(0));
289   return MemRefType::get(newMemrefShape,
290                          VectorType::get(vectorType.getShape().drop_front(),
291                                          vectorType.getElementType()));
292 }
293 
294 /// Given a transfer op, find the memref from which the mask is loaded. This
295 /// is similar to Strategy<TransferWriteOp>::getBuffer.
296 template <typename OpTy>
297 static Value getMaskBuffer(OpTy xferOp) {
298   assert(xferOp.mask() && "Expected that transfer op has mask");
299   auto loadOp = xferOp.mask().template getDefiningOp<memref::LoadOp>();
300   assert(loadOp && "Expected transfer op mask produced by LoadOp");
301   return loadOp.getMemRef();
302 }
303 
/// Codegen strategy, depending on the operation. Specialized below for
/// TransferReadOp and TransferWriteOp.
template <typename OpTy>
struct Strategy;
307 
/// Codegen strategy for vector TransferReadOp.
template <>
struct Strategy<TransferReadOp> {
  /// Find the StoreOp that is used for writing the current TransferReadOp's
  /// result to the temporary buffer allocation.
  static memref::StoreOp getStoreOp(TransferReadOp xferOp) {
    assert(xferOp->hasOneUse() && "Expected exactly one use of TransferReadOp");
    auto storeOp = dyn_cast<memref::StoreOp>((*xferOp->use_begin()).getOwner());
    assert(storeOp && "Expected TransferReadOp result used by StoreOp");
    return storeOp;
  }

  /// Find the temporary buffer allocation. All labeled TransferReadOps are
  /// used like this, where %buf is either the buffer allocation or a type cast
  /// of the buffer allocation:
  /// ```
  /// %vec = vector.transfer_read ... { __vector_to_scf_lowering__ } ...
  /// memref.store %vec, %buf[...] ...
  /// ```
  static Value getBuffer(TransferReadOp xferOp) {
    return getStoreOp(xferOp).getMemRef();
  }

  /// Retrieve the indices of the current StoreOp that stores into the buffer.
  static void getBufferIndices(TransferReadOp xferOp,
                               SmallVector<Value, 8> &indices) {
    auto storeOp = getStoreOp(xferOp);
    auto prevIndices = memref::StoreOpAdaptor(storeOp).indices();
    indices.append(prevIndices.begin(), prevIndices.end());
  }

  /// Rewrite the TransferReadOp, assuming that there are no out-of-bounds
  /// accesses on the to-be-unpacked dimension.
  ///
  /// 1. Generate a new (N-1)-d TransferReadOp using the loop iteration
  ///    variable `iv`.
  /// 2. Store the result into the (already `vector.type_cast`ed) buffer.
  ///
  /// E.g.:
  /// ```
  /// %vec = vector.transfer_read %A[%a+%i, %b, %c], %cst
  ///     : memref<?x?x?xf32>, vector<4x3xf32>
  /// memref.store %vec, %buf[%i] : memref<5xvector<4x3xf32>>
  /// ```
  /// Is rewritten to:
  /// ```
  /// %casted = vector.type_cast %buf
  ///     : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
  /// for %j = 0 to 4 {
  ///   %vec = vector.transfer_read %A[%a+%i, %b+%j, %c], %cst
  ///       : memref<?x?x?xf32>, vector<3xf32>
  ///   memref.store %vec, %casted[%i, %j] : memref<5x4xvector<3xf32>>
  /// }
  /// ```
  ///
  /// Note: The loop and type cast are generated in TransferOpConversion.
  ///       The original TransferReadOp and store op are deleted in `cleanup`.
  /// Note: The `mask` operand is set in TransferOpConversion.
  static TransferReadOp rewriteOp(OpBuilder &b,
                                  VectorTransferToSCFOptions options,
                                  TransferReadOp xferOp, Value buffer,
                                  Value iv) {
    // Buffer store position: previous buffer indices plus the loop iv.
    SmallVector<Value, 8> storeIndices;
    getBufferIndices(xferOp, storeIndices);
    storeIndices.push_back(iv);

    // Memref access position: original indices with iv added to the
    // unpacked dimension (see `getXferIndices`).
    SmallVector<Value, 8> xferIndices;
    getXferIndices(b, xferOp, iv, xferIndices);

    Location loc = xferOp.getLoc();
    auto bufferType = buffer.getType().dyn_cast<ShapedType>();
    auto vecType = bufferType.getElementType().dyn_cast<VectorType>();
    // Drop the leading in_bounds entry to match the rank-reduced vector.
    auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
    // Mask operand is left empty here; it is attached later in
    // TransferOpConversion.
    auto newXferOp = b.create<vector::TransferReadOp>(
        loc, vecType, xferOp.source(), xferIndices,
        AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), xferOp.padding(),
        Value(), inBoundsAttr);

    maybeApplyPassLabel(b, newXferOp, options.targetRank);

    b.create<memref::StoreOp>(loc, newXferOp.vector(), buffer, storeIndices);
    return newXferOp;
  }

  /// Handle out-of-bounds accesses on the to-be-unpacked dimension: Write
  /// padding value to the temporary buffer.
  static void handleOutOfBoundsDim(OpBuilder &b, TransferReadOp xferOp,
                                   Value buffer, Value iv) {
    SmallVector<Value, 8> storeIndices;
    getBufferIndices(xferOp, storeIndices);
    storeIndices.push_back(iv);

    Location loc = xferOp.getLoc();
    auto bufferType = buffer.getType().dyn_cast<ShapedType>();
    auto vecType = bufferType.getElementType().dyn_cast<VectorType>();
    // Splat the padding value into an (N-1)-d vector and store it.
    auto vec = b.create<SplatOp>(loc, vecType, xferOp.padding());
    b.create<memref::StoreOp>(loc, vec, buffer, storeIndices);
  }

  /// Cleanup after rewriting the op: erase the original read and the store
  /// that forwarded its result into the buffer.
  static void cleanup(PatternRewriter &rewriter, TransferReadOp xferOp) {
    rewriter.eraseOp(getStoreOp(xferOp));
    rewriter.eraseOp(xferOp);
  }
};
413 
/// Codegen strategy for vector TransferWriteOp.
template <>
struct Strategy<TransferWriteOp> {
  /// Find the temporary buffer allocation. All labeled TransferWriteOps are
  /// used like this, where %buf is either the buffer allocation or a type cast
  /// of the buffer allocation:
  /// ```
  /// %vec = memref.load %buf[...] ...
  /// vector.transfer_write %vec ... { __vector_to_scf_lowering__ } ...
  /// ```
  static Value getBuffer(TransferWriteOp xferOp) {
    auto loadOp = xferOp.vector().getDefiningOp<memref::LoadOp>();
    assert(loadOp && "Expected transfer op vector produced by LoadOp");
    return loadOp.getMemRef();
  }

  /// Retrieve the indices of the current LoadOp that loads from the buffer.
  static void getBufferIndices(TransferWriteOp xferOp,
                               SmallVector<Value, 8> &indices) {
    auto loadOp = xferOp.vector().getDefiningOp<memref::LoadOp>();
    auto prevIndices = memref::LoadOpAdaptor(loadOp).indices();
    indices.append(prevIndices.begin(), prevIndices.end());
  }

  /// Rewrite the TransferWriteOp, assuming that there are no out-of-bounds
  /// accesses on the to-be-unpacked dimension.
  ///
  /// 1. Load an (N-1)-d vector from the (already `vector.type_cast`ed) buffer,
  ///    using the loop iteration variable `iv`.
  /// 2. Generate a new (N-1)-d TransferWriteOp, writing the loaded vector back
  ///    to memory.
  ///
  /// Note: For more details, see comments on Strategy<TransferReadOp>.
  static TransferWriteOp rewriteOp(OpBuilder &b,
                                   VectorTransferToSCFOptions options,
                                   TransferWriteOp xferOp, Value buffer,
                                   Value iv) {
    // Buffer load position: previous buffer indices plus the loop iv.
    SmallVector<Value, 8> loadIndices;
    getBufferIndices(xferOp, loadIndices);
    loadIndices.push_back(iv);

    // Memref access position: original indices with iv added to the
    // unpacked dimension (see `getXferIndices`).
    SmallVector<Value, 8> xferIndices;
    getXferIndices(b, xferOp, iv, xferIndices);

    Location loc = xferOp.getLoc();
    auto vec = b.create<memref::LoadOp>(loc, buffer, loadIndices);
    // Drop the leading in_bounds entry to match the rank-reduced vector.
    auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
    // Mask operand is left empty here; it is attached later in
    // TransferOpConversion.
    auto newXferOp = b.create<vector::TransferWriteOp>(
        loc, Type(), vec, xferOp.source(), xferIndices,
        AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(),
        inBoundsAttr);

    maybeApplyPassLabel(b, newXferOp, options.targetRank);

    return newXferOp;
  }

  /// Handle out-of-bounds accesses on the to-be-unpacked dimension.
  /// Out-of-bounds writes are simply skipped, so nothing to do here.
  static void handleOutOfBoundsDim(OpBuilder &b, TransferWriteOp xferOp,
                                   Value buffer, Value iv) {}

  /// Cleanup after rewriting the op.
  static void cleanup(PatternRewriter &rewriter, TransferWriteOp xferOp) {
    rewriter.eraseOp(xferOp);
  }
};
480 
481 template <typename OpTy>
482 LogicalResult checkPrepareXferOp(OpTy xferOp,
483                                  VectorTransferToSCFOptions options) {
484   if (xferOp->hasAttr(kPassLabel))
485     return failure();
486   if (xferOp.getVectorType().getRank() <= options.targetRank)
487     return failure();
488   if (xferOp.getShapedType().template isa<RankedTensorType>())
489     return failure();
490   return success();
491 }
492 
493 /// Prepare a TransferReadOp for progressive lowering.
494 ///
495 /// 1. Allocate a temporary buffer.
496 /// 2. Label the TransferReadOp, marking it eligible for progressive lowering.
497 /// 3. Store the result of the TransferReadOp into the temporary buffer.
498 /// 4. Load the result from the temporary buffer and replace all uses of the
499 ///    original TransferReadOp with this load.
500 ///
501 /// E.g.:
502 /// ```
503 /// %vec = vector.transfer_read %A[%a, %b, %c], %cst
504 ///     : vector<5x4xf32>, memref<?x?x?xf32>
505 /// ```
506 /// is rewritten to:
507 /// ```
508 /// %0 = memref.alloca() : memref<vector<5x4xf32>>
509 /// %1 = vector.transfer_read %A[%a, %b, %c], %cst
510 ///     { __vector_to_scf_lowering__ } : vector<5x4xf32>, memref<?x?x?xf32>
511 /// memref.store %1, %0[] : memref<vector<5x4xf32>>
512 /// %vec = memref.load %0[] : memref<vector<5x4xf32>>
513 /// ```
514 ///
515 /// Note: A second temporary buffer may be allocated for the `mask` operand.
struct PrepareTransferReadConversion
    : public VectorToSCFPattern<TransferReadOp> {
  using VectorToSCFPattern<TransferReadOp>::VectorToSCFPattern;

  LogicalResult matchAndRewrite(TransferReadOp xferOp,
                                PatternRewriter &rewriter) const override {
    if (checkPrepareXferOp(xferOp, options).failed())
      return failure();

    // Allocate data (and mask) buffers, clone the transfer op and label the
    // clone for progressive lowering.
    auto buffers = allocBuffers(rewriter, xferOp);
    auto *newXfer = rewriter.clone(*xferOp.getOperation());
    newXfer->setAttr(kPassLabel, rewriter.getUnitAttr());
    if (xferOp.mask()) {
      // Replace the mask operand with the buffered mask (a memref.load); see
      // `allocBuffers` and `getMaskBuffer`.
      dyn_cast<TransferReadOp>(newXfer).maskMutable().assign(
          buffers.maskBuffer);
    }

    // Store the clone's result into the buffer and replace all uses of the
    // original op with a load from that buffer.
    Location loc = xferOp.getLoc();
    rewriter.create<memref::StoreOp>(loc, newXfer->getResult(0),
                                     buffers.dataBuffer);
    rewriter.replaceOpWithNewOp<memref::LoadOp>(xferOp, buffers.dataBuffer);

    return success();
  }
};
541 
542 /// Prepare a TransferWriteOp for progressive lowering.
543 ///
544 /// 1. Allocate a temporary buffer.
545 /// 2. Store the vector into the buffer.
546 /// 3. Load the vector from the buffer again.
547 /// 4. Use the loaded vector as a TransferWriteOp operand and label the op,
548 ///    marking it eligible for progressive lowering via TransferOpConversion.
549 ///
550 /// E.g.:
551 /// ```
552 /// vector.transfer_write %vec, %A[%a, %b, %c]
553 ///     : vector<5x4xf32>, memref<?x?x?xf32>
554 /// ```
555 /// is rewritten to:
556 /// ```
557 /// %0 = memref.alloca() : memref<vector<5x4xf32>>
558 /// memref.store %vec, %0[] : memref<vector<5x4xf32>>
559 /// %1 = memref.load %0[] : memref<vector<5x4xf32>>
560 /// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ }
561 ///     : vector<5x4xf32>, memref<?x?x?xf32>
562 /// ```
563 ///
564 /// Note: A second temporary buffer may be allocated for the `mask` operand.
struct PrepareTransferWriteConversion
    : public VectorToSCFPattern<TransferWriteOp> {
  using VectorToSCFPattern<TransferWriteOp>::VectorToSCFPattern;

  LogicalResult matchAndRewrite(TransferWriteOp xferOp,
                                PatternRewriter &rewriter) const override {
    if (checkPrepareXferOp(xferOp, options).failed())
      return failure();

    // Round-trip the vector through a buffer, then rewrite the op in place to
    // consume the loaded vector and carry the pass label.
    Location loc = xferOp.getLoc();
    auto buffers = allocBuffers(rewriter, xferOp);
    rewriter.create<memref::StoreOp>(loc, xferOp.vector(), buffers.dataBuffer);
    auto loadedVec = rewriter.create<memref::LoadOp>(loc, buffers.dataBuffer);
    rewriter.updateRootInPlace(xferOp, [&]() {
      xferOp.vectorMutable().assign(loadedVec);
      xferOp->setAttr(kPassLabel, rewriter.getUnitAttr());
    });

    if (xferOp.mask()) {
      // Replace the mask operand with the buffered mask; see `allocBuffers`.
      rewriter.updateRootInPlace(
          xferOp, [&]() { xferOp.maskMutable().assign(buffers.maskBuffer); });
    }

    return success();
  }
};
591 
592 /// Progressive lowering of vector transfer ops: Unpack one dimension.
593 ///
594 /// 1. Unpack one dimension from the current buffer type and cast the buffer
595 ///    to that new type. E.g.:
596 ///    ```
597 ///    %vec = memref.load %0[%1] : memref<5xvector<4x3xf32>>
598 ///    vector.transfer_write %vec ...
599 ///    ```
600 ///    The following cast is generated:
601 ///    ```
602 ///    %casted = vector.type_cast %0
603 ///        : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
604 ///    ```
605 /// 2. Generate a for loop and rewrite the transfer op according to the
606 ///    corresponding Strategy<OpTy>. If the to-be-unpacked dimension can be
607 ///    out-of-bounds, generate an if-check and handle both cases separately.
608 /// 3. Clean up according to the corresponding Strategy<OpTy>.
template <typename OpTy>
struct TransferOpConversion : public VectorToSCFPattern<OpTy> {
  using VectorToSCFPattern<OpTy>::VectorToSCFPattern;

  LogicalResult matchAndRewrite(OpTy xferOp,
                                PatternRewriter &rewriter) const override {
    // Only process ops that were labeled by a Prepare*Conversion pattern.
    if (!xferOp->hasAttr(kPassLabel))
      return failure();

    // Find and cast data buffer. How the buffer can be found depends on OpTy.
    ImplicitLocOpBuilder locB(xferOp.getLoc(), rewriter);
    auto dataBuffer = Strategy<OpTy>::getBuffer(xferOp);
    auto dataBufferType = dataBuffer.getType().template dyn_cast<MemRefType>();
    auto castedDataType = unpackOneDim(dataBufferType);
    auto castedDataBuffer =
        locB.create<vector::TypeCastOp>(castedDataType, dataBuffer);

    // If the xferOp has a mask: Find and cast mask buffer.
    Value castedMaskBuffer;
    if (xferOp.mask()) {
      auto maskBuffer = getMaskBuffer(xferOp);
      auto maskBufferType =
          maskBuffer.getType().template dyn_cast<MemRefType>();
      if (xferOp.isBroadcastDim(0) || xferOp.getMaskType().getRank() == 1) {
        // Do not unpack a dimension of the mask, if:
        // * To-be-unpacked transfer op dimension is a broadcast.
        // * Mask is 1D, i.e., the mask cannot be further unpacked.
        //   (That means that all remaining dimensions of the transfer op must
        //   be broadcasted.)
        castedMaskBuffer = maskBuffer;
      } else {
        auto castedMaskType = unpackOneDim(maskBufferType);
        castedMaskBuffer =
            locB.create<vector::TypeCastOp>(castedMaskType, maskBuffer);
      }
    }

    // Loop bounds and step. The upper bound is the extent of the unpacked
    // dimension (last dim of the casted buffer type).
    auto lb = locB.create<ConstantIndexOp>(0);
    auto ub = locB.create<ConstantIndexOp>(
        castedDataType.getDimSize(castedDataType.getRank() - 1));
    auto step = locB.create<ConstantIndexOp>(1);

    // Generate for loop.
    locB.create<scf::ForOp>(
        lb, ub, step, ValueRange(),
        [&](OpBuilder &b, Location loc, Value iv, ValueRange /*loopState*/) {
          generateInBoundsCheck(
              b, xferOp, iv, unpackedDim(xferOp),
              /*inBoundsCase=*/
              [&](OpBuilder &b, Location loc) {
                // Create new transfer op.
                OpTy newXfer = Strategy<OpTy>::rewriteOp(
                    b, this->options, xferOp, castedDataBuffer, iv);

                // If old transfer op has a mask: Set mask on new transfer op.
                // Special case: If the mask of the old transfer op is 1D and
                // the unpacked dim is not a broadcast, no mask is needed on
                // the new transfer op (`generateInBoundsCheck` evaluated the
                // mask element already).
                if (xferOp.mask() && (xferOp.isBroadcastDim(0) ||
                                      xferOp.getMaskType().getRank() > 1)) {
                  OpBuilder::InsertionGuard guard(b);
                  b.setInsertionPoint(newXfer); // Insert load before newXfer.

                  SmallVector<Value, 8> loadIndices;
                  Strategy<OpTy>::getBufferIndices(xferOp, loadIndices);
                  // In case of broadcast: Use same indices to load from memref
                  // as before.
                  if (!xferOp.isBroadcastDim(0))
                    loadIndices.push_back(iv);

                  auto mask = b.create<memref::LoadOp>(loc, castedMaskBuffer,
                                                       loadIndices);
                  rewriter.updateRootInPlace(
                      newXfer, [&]() { newXfer.maskMutable().assign(mask); });
                }
              },
              /*outOfBoundsCase=*/
              [&](OpBuilder &b, Location /*loc*/) {
                Strategy<OpTy>::handleOutOfBoundsDim(b, xferOp,
                                                     castedDataBuffer, iv);
              });
          b.create<scf::YieldOp>(loc);
        });

    // Erase the original transfer op (and, for reads, its companion store).
    Strategy<OpTy>::cleanup(rewriter, xferOp);
    return success();
  }
};
699 
700 } // namespace lowering_n_d
701 
702 namespace lowering_n_d_unrolled {
703 
704 /// If the original transfer op has a mask, compute the mask of the new transfer
705 /// op (for the current iteration `i`) and assign it.
706 template <typename OpTy>
707 static void maybeAssignMask(OpBuilder &b, OpTy xferOp, OpTy newXferOp,
708                             int64_t i) {
709   if (!xferOp.mask())
710     return;
711 
712   if (xferOp.isBroadcastDim(0)) {
713     // To-be-unpacked dimension is a broadcast, which does not have a
714     // corresponding mask dimension. Mask attribute remains unchanged.
715     newXferOp.maskMutable().assign(xferOp.mask());
716     return;
717   }
718 
719   if (xferOp.getMaskType().getRank() > 1) {
720     // Unpack one dimension of the mask.
721     OpBuilder::InsertionGuard guard(b);
722     b.setInsertionPoint(newXferOp); // Insert load before newXfer.
723 
724     llvm::SmallVector<int64_t, 1> indices({i});
725     Location loc = xferOp.getLoc();
726     auto newMask = b.create<vector::ExtractOp>(loc, xferOp.mask(), indices);
727     newXferOp.maskMutable().assign(newMask);
728   }
729 
730   // If we end up here: The mask of the old transfer op is 1D and the unpacked
731   // dim is not a broadcast, so no mask is needed on the new transfer op.
732   // `generateInBoundsCheck` will have evaluated the mask already.
733 }
734 
735 /// Progressive lowering of vector TransferReadOp with unrolling: Unpack one
736 /// dimension. This is similar to TransferOpConversion<TransferReadOp>, but no
737 /// memref buffer is allocated and the SCF loop is fully unrolled.
738 ///
739 /// ```
740 /// E.g.:
741 /// ```
742 /// %vec = vector.transfer_read %A[%a, %b, %c], %padding
743 ///     : memref<?x?x?xf32>, vector<5x4xf32>
744 /// ```
745 /// is rewritten to IR such as (simplified):
746 /// ```
747 /// %v_init = splat %padding : vector<5x4xf32>
748 /// %tmp0 = vector.transfer_read %A[%a, %b, %c], %padding
749 ///     : memref<?x?x?xf32>, vector<4xf32>
750 /// %v0 = vector.insert %tmp0, %v_init[0] : vector<4xf32> into vector<5x4xf32>
751 /// %tmp1 = vector.transfer_read %A[%a, %b + 1, %c], %padding
752 ///     : memref<?x?x?xf32>, vector<4xf32>
753 /// %v1 = vector.insert %tmp1, %v0[1] : vector<4xf32> into vector<5x4xf32>
754 /// ...
755 /// %tmp4 = vector.transfer_read %A[%a, %b + 4, %c], %padding
756 ///     : memref<?x?x?xf32>, vector<4xf32>
757 /// %vec = vector.insert %tmp1, %v3[4] : vector<4xf32> into vector<5x4xf32>
758 /// ```
759 ///
760 /// Note: As an optimization, if the result of the original TransferReadOp
761 /// was directly inserted into another vector, no new %v_init vector is created.
762 /// Instead, the new TransferReadOp results are inserted into that vector.
763 struct UnrollTransferReadConversion
764     : public VectorToSCFPattern<TransferReadOp> {
765   using VectorToSCFPattern<TransferReadOp>::VectorToSCFPattern;
766 
767   /// Return the vector into which the newly created TransferReadOp results
768   /// are inserted.
769   Value getResultVector(TransferReadOp xferOp,
770                         PatternRewriter &rewriter) const {
771     if (auto insertOp = getInsertOp(xferOp))
772       return insertOp.dest();
773     Location loc = xferOp.getLoc();
774     return rewriter.create<SplatOp>(loc, xferOp.getVectorType(),
775                                     xferOp.padding());
776   }
777 
778   /// If the result of the TransferReadOp has exactly one user, which is a
779   /// vector::InsertOp, return that operation.
780   vector::InsertOp getInsertOp(TransferReadOp xferOp) const {
781     if (xferOp->hasOneUse()) {
782       Operation *xferOpUser = *xferOp->getUsers().begin();
783       if (auto insertOp = dyn_cast<vector::InsertOp>(xferOpUser))
784         return insertOp;
785     }
786 
787     return vector::InsertOp();
788   }
789 
790   /// If the result of the TransferReadOp has exactly one user, which is a
791   /// vector::InsertOp, return that operation's indices.
792   void getInsertionIndices(TransferReadOp xferOp,
793                            SmallVector<int64_t, 8> &indices) const {
794     if (auto insertOp = getInsertOp(xferOp)) {
795       llvm::for_each(insertOp.position(), [&](Attribute attr) {
796         indices.push_back(attr.dyn_cast<IntegerAttr>().getInt());
797       });
798     }
799   }
800 
801   /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
802   /// accesses, and broadcasts and transposes in permutation maps.
803   LogicalResult matchAndRewrite(TransferReadOp xferOp,
804                                 PatternRewriter &rewriter) const override {
805     if (xferOp.getVectorType().getRank() <= options.targetRank)
806       return failure();
807     if (xferOp.getShapedType().template isa<RankedTensorType>())
808       return failure();
809 
810     auto insertOp = getInsertOp(xferOp);
811     auto vec = getResultVector(xferOp, rewriter);
812     auto vecType = vec.getType().dyn_cast<VectorType>();
813     auto xferVecType = xferOp.getVectorType();
814     auto newXferVecType = VectorType::get(xferVecType.getShape().drop_front(),
815                                           xferVecType.getElementType());
816     int64_t dimSize = xferVecType.getShape()[0];
817 
818     // Generate fully unrolled loop of transfer ops.
819     Location loc = xferOp.getLoc();
820     for (int64_t i = 0; i < dimSize; ++i) {
821       Value iv = rewriter.create<ConstantIndexOp>(loc, i);
822 
823       vec = generateInBoundsCheck(
824           rewriter, xferOp, iv, unpackedDim(xferOp), TypeRange(vecType),
825           /*inBoundsCase=*/
826           [&](OpBuilder &b, Location loc) {
827             // Indices for the new transfer op.
828             SmallVector<Value, 8> xferIndices;
829             getXferIndices(b, xferOp, iv, xferIndices);
830 
831             // Indices for the new vector.insert op.
832             SmallVector<int64_t, 8> insertionIndices;
833             getInsertionIndices(xferOp, insertionIndices);
834             insertionIndices.push_back(i);
835 
836             auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
837             auto newXferOp = b.create<vector::TransferReadOp>(
838                 loc, newXferVecType, xferOp.source(), xferIndices,
839                 AffineMapAttr::get(unpackedPermutationMap(b, xferOp)),
840                 xferOp.padding(), Value(), inBoundsAttr);
841             maybeAssignMask(b, xferOp, newXferOp, i);
842             return b.create<vector::InsertOp>(loc, newXferOp, vec,
843                                               insertionIndices);
844           },
845           /*outOfBoundsCase=*/
846           [&](OpBuilder &b, Location loc) {
847             // Loop through original (unmodified) vector.
848             return vec;
849           });
850     }
851 
852     if (insertOp) {
853       // Rewrite single user of the old TransferReadOp, which was an InsertOp.
854       rewriter.replaceOp(insertOp, vec);
855       rewriter.eraseOp(xferOp);
856     } else {
857       rewriter.replaceOp(xferOp, vec);
858     }
859 
860     return success();
861   }
862 };
863 
/// Progressive lowering of vector TransferWriteOp with unrolling: Unpack one
/// dimension. This is similar to TransferOpConversion<TransferWriteOp>, but no
/// memref buffer is allocated and the SCF loop is fully unrolled.
///
/// E.g.:
/// ```
/// vector.transfer_write %vec, %A[%a, %b, %c]
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
/// is rewritten to IR such as (simplified):
/// ```
/// %v0 = vector.extract %vec[0] : vector<5x4xf32>
/// vector.transfer_write %v0, %A[%a, %b, %c] : vector<4xf32>, memref<...>
/// %v1 = vector.extract %vec[1] : vector<5x4xf32>
/// vector.transfer_write %v1, %A[%a, %b + 1, %c] : vector<4xf32>, memref<...>
/// ...
/// %v4 = vector.extract %vec[4] : vector<5x4xf32>
/// vector.transfer_write %v4, %A[%a, %b + 4, %c] : vector<4xf32>, memref<...>
/// ```
///
/// Note: As an optimization, if the vector of the original TransferWriteOp
/// was directly extracted from another vector via an ExtractOp `a`, extract
/// the vectors for the newly generated TransferWriteOps from `a`'s input. By
/// doing so, `a` may become dead, and the number of ExtractOps generated during
/// recursive application of this pattern will be minimal.
struct UnrollTransferWriteConversion
    : public VectorToSCFPattern<TransferWriteOp> {
  using VectorToSCFPattern<TransferWriteOp>::VectorToSCFPattern;

  /// Return the vector from which newly generated ExtractOps will extract:
  /// the input of the defining ExtractOp if there is one, otherwise the
  /// written vector itself.
  Value getDataVector(TransferWriteOp xferOp) const {
    if (auto extractOp = getExtractOp(xferOp))
      return extractOp.vector();
    return xferOp.vector();
  }

  /// If the input of the given TransferWriteOp is an ExtractOp, return it.
  /// Otherwise, return a null op.
  vector::ExtractOp getExtractOp(TransferWriteOp xferOp) const {
    if (auto *op = xferOp.vector().getDefiningOp())
      return dyn_cast<vector::ExtractOp>(op);
    return vector::ExtractOp();
  }

  /// If the input of the given TransferWriteOp is an ExtractOp, return its
  /// indices. Otherwise, leave `indices` unchanged.
  void getExtractionIndices(TransferWriteOp xferOp,
                            SmallVector<int64_t, 8> &indices) const {
    if (auto extractOp = getExtractOp(xferOp)) {
      llvm::for_each(extractOp.position(), [&](Attribute attr) {
        indices.push_back(attr.dyn_cast<IntegerAttr>().getInt());
      });
    }
  }

  /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
  /// accesses, and broadcasts and transposes in permutation maps.
  LogicalResult matchAndRewrite(TransferWriteOp xferOp,
                                PatternRewriter &rewriter) const override {
    // Only unpack ops whose vector rank is still above the target rank.
    if (xferOp.getVectorType().getRank() <= options.targetRank)
      return failure();
    // Transfers on tensors are not handled by this pattern.
    if (xferOp.getShapedType().template isa<RankedTensorType>())
      return failure();

    auto vec = getDataVector(xferOp);
    auto xferVecType = xferOp.getVectorType();
    int64_t dimSize = xferVecType.getShape()[0];

    // Generate fully unrolled loop of transfer ops.
    Location loc = xferOp.getLoc();
    for (int64_t i = 0; i < dimSize; ++i) {
      Value iv = rewriter.create<ConstantIndexOp>(loc, i);

      generateInBoundsCheck(
          rewriter, xferOp, iv, unpackedDim(xferOp),
          /*inBoundsCase=*/[&](OpBuilder &b, Location loc) {
            // Indices for the new transfer op.
            SmallVector<Value, 8> xferIndices;
            getXferIndices(b, xferOp, iv, xferIndices);

            // Indices for the new vector.extract op.
            SmallVector<int64_t, 8> extractionIndices;
            getExtractionIndices(xferOp, extractionIndices);
            extractionIndices.push_back(i);

            auto extracted =
                b.create<vector::ExtractOp>(loc, vec, extractionIndices);
            auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());

            auto newXferOp = b.create<vector::TransferWriteOp>(
                loc, Type(), extracted, xferOp.source(), xferIndices,
                AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(),
                inBoundsAttr);

            maybeAssignMask(b, xferOp, newXferOp, i);
          });
    }

    // All unrolled writes have been generated; the original op is now dead.
    rewriter.eraseOp(xferOp);
    return success();
  }
};
966 
967 } // namespace lowering_n_d_unrolled
968 
969 namespace lowering_1_d {
970 
/// Compute the indices into the memref for the LoadOp/StoreOp generated as
/// part of TransferOp1dConversion. Return the memref dimension on which
/// the transfer is operating. A return value of None indicates a broadcast.
template <typename OpTy>
static Optional<int64_t>
get1dMemrefIndices(OpBuilder &b, OpTy xferOp, Value iv,
                   SmallVector<Value, 8> &memrefIndices) {
  auto indices = xferOp.indices();
  auto map = xferOp.permutation_map();

  // Start from the transfer op's base indices.
  memrefIndices.append(indices.begin(), indices.end());
  assert(map.getNumResults() == 1 &&
         "Expected 1 permutation map result for 1D transfer");
  if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
    Location loc = xferOp.getLoc();
    auto dim = expr.getPosition();
    // Offset the index of the transfer dimension by the loop induction
    // variable: index[dim] = index[dim] + iv.
    AffineExpr d0, d1;
    bindDims(xferOp.getContext(), d0, d1);
    Value offset = memrefIndices[dim];
    memrefIndices[dim] = makeComposedAffineApply(b, loc, d0 + d1, {offset, iv});
    return dim;
  }

  // Not an AffineDimExpr: the unpacked dim must be a broadcast, so the base
  // indices are used unmodified.
  assert(xferOp.isBroadcastDim(0) &&
         "Expected AffineDimExpr or AffineConstantExpr");
  return None;
}
998 
/// Codegen strategy for TransferOp1dConversion, depending on the
/// operation. Specialized below for TransferReadOp and TransferWriteOp.
template <typename OpTy>
struct Strategy1d;
1003 
/// Codegen strategy for TransferReadOp.
template <>
struct Strategy1d<TransferReadOp> {
  /// Generate the scf.for loop body: load one scalar element (when in bounds)
  /// and insert it into the loop-carried result vector, then yield the
  /// updated vector.
  static void generateForLoopBody(OpBuilder &b, Location loc,
                                  TransferReadOp xferOp, Value iv,
                                  ValueRange loopState) {
    SmallVector<Value, 8> indices;
    auto dim = get1dMemrefIndices(b, xferOp, iv, indices);
    // vector.insertelement takes an i32 position operand.
    Value ivI32 =
        b.create<IndexCastOp>(loc, IntegerType::get(b.getContext(), 32), iv);
    auto vec = loopState[0];

    // In case of out-of-bounds access, leave `vec` as is (was initialized with
    // padding value).
    auto nextVec = generateInBoundsCheck(
        b, xferOp, iv, dim, TypeRange(xferOp.getVectorType()),
        /*inBoundsCase=*/
        [&](OpBuilder &b, Location loc) {
          Value val = b.create<memref::LoadOp>(loc, xferOp.source(), indices);
          return b.create<vector::InsertElementOp>(loc, val, vec, ivI32);
        },
        /*outOfBoundsCase=*/
        [&](OpBuilder & /*b*/, Location loc) { return vec; });
    b.create<scf::YieldOp>(loc, nextVec);
  }

  /// Return the initial loop state: a vector splatted with the padding value.
  static Value initialLoopState(OpBuilder &b, TransferReadOp xferOp) {
    // Initialize vector with padding value.
    Location loc = xferOp.getLoc();
    return b.create<SplatOp>(loc, xferOp.getVectorType(), xferOp.padding());
  }
};
1036 
/// Codegen strategy for TransferWriteOp.
template <>
struct Strategy1d<TransferWriteOp> {
  /// Generate the scf.for loop body: extract one scalar element from the
  /// written vector and store it to the memref (when in bounds).
  static void generateForLoopBody(OpBuilder &b, Location loc,
                                  TransferWriteOp xferOp, Value iv,
                                  ValueRange /*loopState*/) {
    SmallVector<Value, 8> indices;
    auto dim = get1dMemrefIndices(b, xferOp, iv, indices);
    // vector.extractelement takes an i32 position operand.
    Value ivI32 =
        b.create<IndexCastOp>(loc, IntegerType::get(b.getContext(), 32), iv);

    // Nothing to do in case of out-of-bounds access.
    generateInBoundsCheck(
        b, xferOp, iv, dim,
        /*inBoundsCase=*/[&](OpBuilder &b, Location loc) {
          auto val =
              b.create<vector::ExtractElementOp>(loc, xferOp.vector(), ivI32);
          b.create<memref::StoreOp>(loc, val, xferOp.source(), indices);
        });
    b.create<scf::YieldOp>(loc);
  }

  /// Writes carry no loop state; return a null Value so that the generated
  /// scf.for has no iter_args.
  static Value initialLoopState(OpBuilder &b, TransferWriteOp xferOp) {
    return Value();
  }
};
1063 
1064 /// Return true if the last dimension of the MemRefType has unit stride.
1065 static bool isLastMemrefDimUnitStride(MemRefType type) {
1066   int64_t offset;
1067   SmallVector<int64_t, 4> strides;
1068   auto successStrides = getStridesAndOffset(type, strides, offset);
1069   return succeeded(successStrides) && (strides.empty() || strides.back() == 1);
1070 }
1071 
/// Lower a 1D vector transfer op to SCF using scalar loads/stores. This is
/// necessary in cases where a 1D vector transfer op cannot be lowered into
/// vector load/stores due to non-unit strides or broadcasts:
///
/// * Transfer dimension is not the last memref dimension
/// * Transfer dimension is a broadcast (i.e., scalar load + broadcast)
/// * Memref has a layout map with non-unit stride on the last dimension
///
/// This pattern generates IR as follows:
///
/// 1. Generate a for loop iterating over each vector element.
/// 2. Inside the loop, generate a InsertElementOp or ExtractElementOp,
///    depending on OpTy.
///
/// TODO: In some cases (no masking, etc.), LLVM::MatrixColumnMajorLoadOp
///       can be generated instead of TransferOp1dConversion. Add such a pattern
///       to ConvertVectorToLLVM.
///
/// E.g.:
/// ```
/// vector.transfer_write %vec, %A[%a, %b]
///    {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
///    : vector<9xf32>, memref<?x?xf32>
/// ```
/// Is rewritten to approximately the following pseudo-IR:
/// ```
/// for i = 0 to 9 {
///   %t = vector.extractelement %vec[i] : vector<9xf32>
///   memref.store %t, %arg0[%a + i, %b] : memref<?x?xf32>
/// }
/// ```
template <typename OpTy>
struct TransferOp1dConversion : public VectorToSCFPattern<OpTy> {
  using VectorToSCFPattern<OpTy>::VectorToSCFPattern;

  LogicalResult matchAndRewrite(OpTy xferOp,
                                PatternRewriter &rewriter) const override {
    auto map = xferOp.permutation_map();
    auto memRefType = xferOp.getShapedType().template dyn_cast<MemRefType>();

    // Only memref transfers are handled by this pattern.
    if (!memRefType)
      return failure();
    // Only 1D vectors are handled by this pattern.
    if (xferOp.getVectorType().getRank() != 1)
      return failure();
    if (map.isMinorIdentity() && isLastMemrefDimUnitStride(memRefType))
      return failure(); // Handled by ConvertVectorToLLVM

    // Loop bounds, step, state...
    Location loc = xferOp.getLoc();
    auto vecType = xferOp.getVectorType();
    auto lb = rewriter.create<ConstantIndexOp>(loc, 0);
    auto ub = rewriter.create<ConstantIndexOp>(loc, vecType.getDimSize(0));
    auto step = rewriter.create<ConstantIndexOp>(loc, 1);
    // A null loopState (TransferWriteOp case) means the loop has no iter_args.
    auto loopState = Strategy1d<OpTy>::initialLoopState(rewriter, xferOp);

    // Generate for loop. The body is supplied by the per-op Strategy1d
    // specialization.
    rewriter.replaceOpWithNewOp<scf::ForOp>(
        xferOp, lb, ub, step, loopState ? ValueRange(loopState) : ValueRange(),
        [&](OpBuilder &b, Location loc, Value iv, ValueRange loopState) {
          Strategy1d<OpTy>::generateForLoopBody(b, loc, xferOp, iv, loopState);
        });

    return success();
  }
};
1137 
1138 } // namespace lowering_1_d
1139 } // namespace
1140 
1141 namespace mlir {
1142 
1143 void populateVectorToSCFConversionPatterns(
1144     RewritePatternSet &patterns, const VectorTransferToSCFOptions &options) {
1145   if (options.unroll) {
1146     patterns.add<lowering_n_d_unrolled::UnrollTransferReadConversion,
1147                  lowering_n_d_unrolled::UnrollTransferWriteConversion>(
1148         patterns.getContext(), options);
1149   } else {
1150     patterns.add<lowering_n_d::PrepareTransferReadConversion,
1151                  lowering_n_d::PrepareTransferWriteConversion,
1152                  lowering_n_d::TransferOpConversion<TransferReadOp>,
1153                  lowering_n_d::TransferOpConversion<TransferWriteOp>>(
1154         patterns.getContext(), options);
1155   }
1156 
1157   if (options.targetRank == 1) {
1158     patterns.add<lowering_1_d::TransferOp1dConversion<TransferReadOp>,
1159                  lowering_1_d::TransferOp1dConversion<TransferWriteOp>>(
1160         patterns.getContext(), options);
1161   }
1162 }
1163 
1164 } // namespace mlir
1165 
1166 namespace {
1167 
/// Pass that lowers vector transfer ops to SCF, configured either through
/// the pass's command-line flags or a VectorTransferToSCFOptions struct.
struct ConvertVectorToSCFPass
    : public ConvertVectorToSCFBase<ConvertVectorToSCFPass> {
  ConvertVectorToSCFPass() = default;
  /// Initialize the pass flags from the given options struct.
  ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
    this->fullUnroll = options.unroll;
    this->targetRank = options.targetRank;
    this->lowerPermutationMaps = options.lowerPermutationMaps;
  }

  void runOnFunction() override {
    // Rebuild the options struct from the pass flags (which may have been set
    // via the command line).
    VectorTransferToSCFOptions options;
    options.unroll = fullUnroll;
    options.targetRank = targetRank;
    options.lowerPermutationMaps = lowerPermutationMaps;

    // Lower permutation maps first.
    if (lowerPermutationMaps) {
      RewritePatternSet lowerTransferPatterns(getFunction().getContext());
      mlir::vector::populateVectorTransferPermutationMapLoweringPatterns(
          lowerTransferPatterns);
      (void)applyPatternsAndFoldGreedily(getFunction(),
                                         std::move(lowerTransferPatterns));
    }

    // Then apply the actual transfer-to-SCF lowering patterns.
    RewritePatternSet patterns(getFunction().getContext());
    populateVectorToSCFConversionPatterns(patterns, options);
    (void)applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));
  }
};
1197 
1198 } // namespace
1199 
/// Create a ConvertVectorToSCF pass configured with the given options.
std::unique_ptr<Pass>
mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
  return std::make_unique<ConvertVectorToSCFPass>(options);
}
1204