1 //===- VectorToSCF.cpp - Convert vector to SCF dialect ----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements lowering of vector transfer operations to SCF.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <type_traits>
14 
15 #include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
16 
17 #include "../PassDetail.h"
18 #include "mlir/Dialect/Affine/IR/AffineOps.h"
19 #include "mlir/Dialect/Affine/Utils.h"
20 #include "mlir/Dialect/SCF/SCF.h"
21 #include "mlir/Dialect/Vector/VectorOps.h"
22 #include "mlir/Dialect/Vector/VectorUtils.h"
23 #include "mlir/IR/Builders.h"
24 #include "mlir/IR/ImplicitLocOpBuilder.h"
25 #include "mlir/Pass/Pass.h"
26 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
27 #include "mlir/Transforms/Passes.h"
28 
29 using namespace mlir;
30 using vector::TransferReadOp;
31 using vector::TransferWriteOp;
32 
33 namespace {
34 
/// Attribute name used for labeling transfer ops during progressive lowering.
/// The Prepare*Conversion patterns attach this label; TransferOpConversion
/// only matches ops carrying it, and maybeApplyPassLabel re-applies it to the
/// new (N-1)-d transfer op while its rank is still above the target rank.
static const char kPassLabel[] = "__vector_to_scf_lowering__";
37 
38 /// Patterns that inherit from this struct have access to
39 /// VectorTransferToSCFOptions.
40 template <typename OpTy>
41 struct VectorToSCFPattern : public OpRewritePattern<OpTy> {
42   explicit VectorToSCFPattern(MLIRContext *context,
43                               VectorTransferToSCFOptions opt)
44       : OpRewritePattern<OpTy>(context), options(opt) {}
45 
46   VectorTransferToSCFOptions options;
47 };
48 
49 /// Given a vector transfer op, calculate which dimension of the `source`
50 /// memref should be unpacked in the next application of TransferOpConversion.
51 /// A return value of None indicates a broadcast.
52 template <typename OpTy>
53 static Optional<int64_t> unpackedDim(OpTy xferOp) {
54   auto map = xferOp.permutation_map();
55   if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
56     return expr.getPosition();
57   }
58   assert(xferOp.isBroadcastDim(0) &&
59          "Expected AffineDimExpr or AffineConstantExpr");
60   return None;
61 }
62 
63 /// Compute the permutation map for the new (N-1)-D vector transfer op. This
64 /// map is identical to the current permutation map, but the first result is
65 /// omitted.
66 template <typename OpTy>
67 static AffineMap unpackedPermutationMap(OpBuilder &b, OpTy xferOp) {
68   auto map = xferOp.permutation_map();
69   return AffineMap::get(map.getNumDims(), 0, map.getResults().drop_front(),
70                         b.getContext());
71 }
72 
73 /// Calculate the indices for the new vector transfer op.
74 ///
75 /// E.g.: transfer_read %A[%a, %b, %c, %d] ... : vector<5x4x3xf32> ...
76 ///       --> transfer_read %A[%a, %b + iv, %c, %d] ... vector<4x3f32>
77 ///                                 ^^^^^^
78 ///              `iv` is the iteration variable of the (new) surrounding loop.
79 template <typename OpTy>
80 static void getXferIndices(OpBuilder &b, OpTy xferOp, Value iv,
81                            SmallVector<Value, 8> &indices) {
82   typename OpTy::Adaptor adaptor(xferOp);
83   // Corresponding memref dim of the vector dim that is unpacked.
84   auto dim = unpackedDim(xferOp);
85   auto prevIndices = adaptor.indices();
86   indices.append(prevIndices.begin(), prevIndices.end());
87 
88   Location loc = xferOp.getLoc();
89   bool isBroadcast = !dim.hasValue();
90   if (!isBroadcast) {
91     AffineExpr d0, d1;
92     bindDims(xferOp.getContext(), d0, d1);
93     Value offset = adaptor.indices()[dim.getValue()];
94     indices[dim.getValue()] =
95         makeComposedAffineApply(b, loc, d0 + d1, {offset, iv});
96   }
97 }
98 
99 static void maybeYieldValue(OpBuilder &b, Location loc, bool hasRetVal,
100                             Value value) {
101   if (hasRetVal) {
102     b.create<scf::YieldOp>(loc, value);
103   } else {
104     b.create<scf::YieldOp>(loc);
105   }
106 }
107 
108 /// Generates a boolean Value that is true if the iv-th bit in xferOp's mask
109 /// is set to true. No such check is generated under following circumstances:
110 /// * xferOp does not have a mask.
111 /// * xferOp's mask is not 1D. (In case of (N>1)-D, a subvector of the mask is
112 ///   computed and attached to the new transfer op in the pattern.)
113 /// * The to-be-unpacked dim of xferOp is a broadcast.
114 template <typename OpTy>
115 static Value generateMaskCheck(OpBuilder &b, OpTy xferOp, Value iv) {
116   if (!xferOp.mask())
117     return Value();
118   if (xferOp.getMaskType().getRank() != 1)
119     return Value();
120   if (xferOp.isBroadcastDim(0))
121     return Value();
122 
123   Location loc = xferOp.getLoc();
124   Value ivI32 =
125       b.create<IndexCastOp>(loc, IntegerType::get(b.getContext(), 32), iv);
126   return b.create<vector::ExtractElementOp>(loc, xferOp.mask(), ivI32);
127 }
128 
/// Helper function for TransferOpConversion and TransferOp1dConversion.
/// Generate an in-bounds check if the transfer op may go out-of-bounds on the
/// specified dimension `dim` with the loop iteration variable `iv`.
/// E.g., when unpacking dimension 0 from:
/// ```
/// %vec = vector.transfer_read %A[%a, %b] %cst
///     : vector<5x4xf32>, memref<?x?xf32>
/// ```
/// An if check similar to this will be generated inside the loop:
/// ```
/// %d = memref.dim %A, %c0 : memref<?x?xf32>
/// if (%a + iv < %d) {
///   (in-bounds case)
/// } else {
///   (out-of-bounds case)
/// }
/// ```
///
/// If the transfer is 1D and has a mask, this function generates a more
/// complex check that also accounts for potentially masked out elements.
///
/// This function variant returns the value returned by `inBoundsCase` or
/// `outOfBoundsCase`. The MLIR type of the return value must be specified in
/// `resultTypes`.
template <typename OpTy>
static Value generateInBoundsCheck(
    OpBuilder &b, OpTy xferOp, Value iv, Optional<int64_t> dim,
    TypeRange resultTypes,
    function_ref<Value(OpBuilder &, Location)> inBoundsCase,
    function_ref<Value(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
  bool hasRetVal = !resultTypes.empty();
  Value cond; // Condition to be built...

  // Condition check 1: Access in-bounds?
  bool isBroadcast = !dim.hasValue(); // No in-bounds check for broadcasts.
  Location loc = xferOp.getLoc();
  ImplicitLocOpBuilder lb(xferOp.getLoc(), b);
  if (!xferOp.isDimInBounds(0) && !isBroadcast) {
    // In-bounds iff `memrefDim > base + iv` (strict signed comparison).
    Value memrefDim = lb.create<memref::DimOp>(xferOp.source(), *dim);
    AffineExpr d0, d1;
    bindDims(xferOp.getContext(), d0, d1);
    Value base = xferOp.indices()[dim.getValue()];
    Value memrefIdx = makeComposedAffineApply(b, loc, d0 + d1, {base, iv});
    cond = lb.create<CmpIOp>(CmpIPredicate::sgt, memrefDim, memrefIdx);
  }

  // Condition check 2: Masked in?
  // (Only generated for 1-D masks on non-broadcast dims; see
  // generateMaskCheck.)
  if (auto maskCond = generateMaskCheck(b, xferOp, iv)) {
    // Conjoin with the in-bounds condition if one was built above.
    if (cond)
      cond = lb.create<AndOp>(cond, maskCond);
    else
      cond = maskCond;
  }

  // If the condition is non-empty, generate an SCF::IfOp.
  if (cond) {
    auto check = lb.create<scf::IfOp>(
        resultTypes, cond,
        /*thenBuilder=*/
        [&](OpBuilder &b, Location loc) {
          maybeYieldValue(b, loc, hasRetVal, inBoundsCase(b, loc));
        },
        /*elseBuilder=*/
        [&](OpBuilder &b, Location loc) {
          if (outOfBoundsCase) {
            maybeYieldValue(b, loc, hasRetVal, outOfBoundsCase(b, loc));
          } else {
            b.create<scf::YieldOp>(loc);
          }
        });

    return hasRetVal ? check.getResult(0) : Value();
  }

  // Condition is empty, no need for an SCF::IfOp.
  return inBoundsCase(b, loc);
}
206 
207 /// In this function variant, `inBoundsCase` and `outOfBoundsCase` do not have
208 /// a return value. Consequently, this function does not have a return value.
209 template <typename OpTy>
210 static void generateInBoundsCheck(
211     OpBuilder &b, OpTy xferOp, Value iv, Optional<int64_t> dim,
212     function_ref<void(OpBuilder &, Location)> inBoundsCase,
213     function_ref<void(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
214   generateInBoundsCheck(
215       b, xferOp, iv, dim, /*resultTypes=*/TypeRange(),
216       /*inBoundsCase=*/
217       [&](OpBuilder &b, Location loc) {
218         inBoundsCase(b, loc);
219         return Value();
220       },
221       /*outOfBoundsCase=*/
222       [&](OpBuilder &b, Location loc) {
223         if (outOfBoundsCase)
224           outOfBoundsCase(b, loc);
225         return Value();
226       });
227 }
228 
229 /// Given an ArrayAttr, return a copy where the first element is dropped.
230 static ArrayAttr dropFirstElem(OpBuilder &b, ArrayAttr attr) {
231   if (!attr)
232     return attr;
233   return ArrayAttr::get(b.getContext(), attr.getValue().drop_front());
234 }
235 
236 /// Add the pass label to a vector transfer op if its rank is not the target
237 /// rank.
238 template <typename OpTy>
239 static void maybeApplyPassLabel(OpBuilder &b, OpTy newXferOp,
240                                 unsigned targetRank) {
241   if (newXferOp.getVectorType().getRank() > targetRank)
242     newXferOp->setAttr(kPassLabel, b.getUnitAttr());
243 }
244 
245 namespace lowering_n_d {
246 
/// Helper data structure for data and mask buffers.
struct BufferAllocs {
  // Alloca'd memref (of vector type) that buffers the transfer's vector.
  Value dataBuffer;
  // Value loaded back from the mask's alloca (see allocBuffers); only set
  // when the transfer op has a mask.
  Value maskBuffer;
};
252 
/// Allocate temporary buffers for data (vector) and mask (if present).
/// TODO: Parallelism and threadlocal considerations.
template <typename OpTy>
static BufferAllocs allocBuffers(OpBuilder &b, OpTy xferOp) {
  Location loc = xferOp.getLoc();
  OpBuilder::InsertionGuard guard(b);
  // Hoist the allocas to the closest enclosing allocation scope so they are
  // not created inside the loop nest generated later.
  Operation *scope =
      xferOp->template getParentWithTrait<OpTrait::AutomaticAllocationScope>();
  assert(scope && "Expected op to be inside automatic allocation scope");
  b.setInsertionPointToStart(&scope->getRegion(0).front());

  BufferAllocs result;
  // Rank-0 memref wrapping the full vector.
  auto bufferType = MemRefType::get({}, xferOp.getVectorType());
  result.dataBuffer = b.create<memref::AllocaOp>(loc, bufferType);

  if (xferOp.mask()) {
    auto maskType = MemRefType::get({}, xferOp.mask().getType());
    auto maskBuffer = b.create<memref::AllocaOp>(loc, maskType);
    // Store the mask right before the transfer op and immediately load it
    // back; consumers use the loaded value (see getMaskBuffer).
    b.setInsertionPoint(xferOp);
    b.create<memref::StoreOp>(loc, xferOp.mask(), maskBuffer);
    result.maskBuffer = b.create<memref::LoadOp>(loc, maskBuffer);
  }

  return result;
}
278 
279 /// Given a MemRefType with VectorType element type, unpack one dimension from
280 /// the VectorType into the MemRefType.
281 ///
282 /// E.g.: memref<9xvector<5x6xf32>> --> memref<9x5xvector<6xf32>>
283 static MemRefType unpackOneDim(MemRefType type) {
284   auto vectorType = type.getElementType().dyn_cast<VectorType>();
285   auto memrefShape = type.getShape();
286   SmallVector<int64_t, 8> newMemrefShape;
287   newMemrefShape.append(memrefShape.begin(), memrefShape.end());
288   newMemrefShape.push_back(vectorType.getDimSize(0));
289   return MemRefType::get(newMemrefShape,
290                          VectorType::get(vectorType.getShape().drop_front(),
291                                          vectorType.getElementType()));
292 }
293 
294 /// Given a transfer op, find the memref from which the mask is loaded. This
295 /// is similar to Strategy<TransferWriteOp>::getBuffer.
296 template <typename OpTy>
297 static Value getMaskBuffer(OpTy xferOp) {
298   assert(xferOp.mask() && "Expected that transfer op has mask");
299   auto loadOp = xferOp.mask().template getDefiningOp<memref::LoadOp>();
300   assert(loadOp && "Expected transfer op mask produced by LoadOp");
301   return loadOp.getMemRef();
302 }
303 
/// Codegen strategy, depending on the operation.
/// Specialized below for TransferReadOp and TransferWriteOp; each
/// specialization knows how to find its temporary buffer, rewrite one
/// unpacked dimension, handle the out-of-bounds case, and clean up the
/// original op.
template <typename OpTy>
struct Strategy;
307 
308 /// Code strategy for vector TransferReadOp.
309 template <>
310 struct Strategy<TransferReadOp> {
311   /// Find the StoreOp that is used for writing the current TransferReadOp's
312   /// result to the temporary buffer allocation.
313   static memref::StoreOp getStoreOp(TransferReadOp xferOp) {
314     assert(xferOp->hasOneUse() && "Expected exactly one use of TransferReadOp");
315     auto storeOp = dyn_cast<memref::StoreOp>((*xferOp->use_begin()).getOwner());
316     assert(storeOp && "Expected TransferReadOp result used by StoreOp");
317     return storeOp;
318   }
319 
320   /// Find the temporary buffer allocation. All labeled TransferReadOps are
321   /// used like this, where %buf is either the buffer allocation or a type cast
322   /// of the buffer allocation:
323   /// ```
324   /// %vec = vector.transfer_read ... { __vector_to_scf_lowering__ } ...
325   /// memref.store %vec, %buf[...] ...
326   /// ```
327   static Value getBuffer(TransferReadOp xferOp) {
328     return getStoreOp(xferOp).getMemRef();
329   }
330 
331   /// Retrieve the indices of the current StoreOp that stores into the buffer.
332   static void getBufferIndices(TransferReadOp xferOp,
333                                SmallVector<Value, 8> &indices) {
334     auto storeOp = getStoreOp(xferOp);
335     auto prevIndices = memref::StoreOpAdaptor(storeOp).indices();
336     indices.append(prevIndices.begin(), prevIndices.end());
337   }
338 
339   /// Rewrite the TransferReadOp, assuming that there are no out-of-bounds
340   /// accesses on the to-be-unpacked dimension.
341   ///
342   /// 1. Generate a new (N-1)-d TransferReadOp using the loop iteration
343   ///    variable `iv`.
344   /// 2. Store the result into the (already `vector.type_cast`ed) buffer.
345   ///
346   /// E.g.:
347   /// ```
348   /// %vec = vector.transfer_read %A[%a+%i, %b, %c], %cst
349   ///     : memref<?x?x?xf32>, vector<4x3xf32>
350   /// memref.store %vec, %buf[%i] : memref<5xvector<4x3xf32>>
351   /// ```
352   /// Is rewritten to:
353   /// ```
354   /// %casted = vector.type_cast %buf
355   ///     : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
356   /// for %j = 0 to 4 {
357   ///   %vec = vector.transfer_read %A[%a+%i, %b+%j, %c], %cst
358   ///       : memref<?x?x?xf32>, vector<3xf32>
359   ///   memref.store %vec, %casted[%i, %j] : memref<5x4xvector<3xf32>>
360   /// }
361   /// ```
362   ///
363   /// Note: The loop and type cast are generated in TransferOpConversion.
364   ///       The original TransferReadOp and store op are deleted in `cleanup`.
365   /// Note: The `mask` operand is set in TransferOpConversion.
366   static TransferReadOp rewriteOp(OpBuilder &b,
367                                   VectorTransferToSCFOptions options,
368                                   TransferReadOp xferOp, Value buffer,
369                                   Value iv) {
370     SmallVector<Value, 8> storeIndices;
371     getBufferIndices(xferOp, storeIndices);
372     storeIndices.push_back(iv);
373 
374     SmallVector<Value, 8> xferIndices;
375     getXferIndices(b, xferOp, iv, xferIndices);
376 
377     Location loc = xferOp.getLoc();
378     auto bufferType = buffer.getType().dyn_cast<ShapedType>();
379     auto vecType = bufferType.getElementType().dyn_cast<VectorType>();
380     auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
381     auto newXferOp = b.create<vector::TransferReadOp>(
382         loc, vecType, xferOp.source(), xferIndices,
383         AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), xferOp.padding(),
384         Value(), inBoundsAttr);
385 
386     maybeApplyPassLabel(b, newXferOp, options.targetRank);
387 
388     b.create<memref::StoreOp>(loc, newXferOp.vector(), buffer, storeIndices);
389     return newXferOp;
390   }
391 
392   /// Handle out-of-bounds accesses on the to-be-unpacked dimension: Write
393   /// padding value to the temporary buffer.
394   static void handleOutOfBoundsDim(OpBuilder &b, TransferReadOp xferOp,
395                                    Value buffer, Value iv) {
396     SmallVector<Value, 8> storeIndices;
397     getBufferIndices(xferOp, storeIndices);
398     storeIndices.push_back(iv);
399 
400     Location loc = xferOp.getLoc();
401     auto bufferType = buffer.getType().dyn_cast<ShapedType>();
402     auto vecType = bufferType.getElementType().dyn_cast<VectorType>();
403     auto vec = b.create<SplatOp>(loc, vecType, xferOp.padding());
404     b.create<memref::StoreOp>(loc, vec, buffer, storeIndices);
405   }
406 
407   /// Cleanup after rewriting the op.
408   static void cleanup(PatternRewriter &rewriter, TransferReadOp xferOp) {
409     rewriter.eraseOp(getStoreOp(xferOp));
410     rewriter.eraseOp(xferOp);
411   }
412 };
413 
414 /// Codegen strategy for vector TransferWriteOp.
415 template <>
416 struct Strategy<TransferWriteOp> {
417   /// Find the temporary buffer allocation. All labeled TransferWriteOps are
418   /// used like this, where %buf is either the buffer allocation or a type cast
419   /// of the buffer allocation:
420   /// ```
421   /// %vec = memref.load %buf[...] ...
422   /// vector.transfer_write %vec ... { __vector_to_scf_lowering__ } ...
423   /// ```
424   static Value getBuffer(TransferWriteOp xferOp) {
425     auto loadOp = xferOp.vector().getDefiningOp<memref::LoadOp>();
426     assert(loadOp && "Expected transfer op vector produced by LoadOp");
427     return loadOp.getMemRef();
428   }
429 
430   /// Retrieve the indices of the current LoadOp that loads from the buffer.
431   static void getBufferIndices(TransferWriteOp xferOp,
432                                SmallVector<Value, 8> &indices) {
433     auto loadOp = xferOp.vector().getDefiningOp<memref::LoadOp>();
434     auto prevIndices = memref::LoadOpAdaptor(loadOp).indices();
435     indices.append(prevIndices.begin(), prevIndices.end());
436   }
437 
438   /// Rewrite the TransferWriteOp, assuming that there are no out-of-bounds
439   /// accesses on the to-be-unpacked dimension.
440   ///
441   /// 1. Load an (N-1)-d vector from the (already `vector.type_cast`ed) buffer,
442   ///    using the loop iteration variable `iv`.
443   /// 2. Generate a new (N-1)-d TransferWriteOp, writing the loaded vector back
444   ///    to memory.
445   ///
446   /// Note: For more details, see comments on Strategy<TransferReadOp>.
447   static TransferWriteOp rewriteOp(OpBuilder &b,
448                                    VectorTransferToSCFOptions options,
449                                    TransferWriteOp xferOp, Value buffer,
450                                    Value iv) {
451     SmallVector<Value, 8> loadIndices;
452     getBufferIndices(xferOp, loadIndices);
453     loadIndices.push_back(iv);
454 
455     SmallVector<Value, 8> xferIndices;
456     getXferIndices(b, xferOp, iv, xferIndices);
457 
458     Location loc = xferOp.getLoc();
459     auto vec = b.create<memref::LoadOp>(loc, buffer, loadIndices);
460     auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
461     auto newXferOp = b.create<vector::TransferWriteOp>(
462         loc, Type(), vec, xferOp.source(), xferIndices,
463         AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(),
464         inBoundsAttr);
465 
466     maybeApplyPassLabel(b, newXferOp, options.targetRank);
467 
468     return newXferOp;
469   }
470 
471   /// Handle out-of-bounds accesses on the to-be-unpacked dimension.
472   static void handleOutOfBoundsDim(OpBuilder &b, TransferWriteOp xferOp,
473                                    Value buffer, Value iv) {}
474 
475   /// Cleanup after rewriting the op.
476   static void cleanup(PatternRewriter &rewriter, TransferWriteOp xferOp) {
477     rewriter.eraseOp(xferOp);
478   }
479 };
480 
481 template <typename OpTy>
482 LogicalResult checkPrepareXferOp(OpTy xferOp,
483                                  VectorTransferToSCFOptions options) {
484   if (xferOp->hasAttr(kPassLabel))
485     return failure();
486   if (xferOp.getVectorType().getRank() <= options.targetRank)
487     return failure();
488   return success();
489 }
490 
/// Prepare a TransferReadOp for progressive lowering.
///
/// 1. Allocate a temporary buffer.
/// 2. Label the TransferReadOp, marking it eligible for progressive lowering.
/// 3. Store the result of the TransferReadOp into the temporary buffer.
/// 4. Load the result from the temporary buffer and replace all uses of the
///    original TransferReadOp with this load.
///
/// E.g.:
/// ```
/// %vec = vector.transfer_read %A[%a, %b, %c], %cst
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
/// is rewritten to:
/// ```
/// %0 = memref.alloca() : memref<vector<5x4xf32>>
/// %1 = vector.transfer_read %A[%a, %b, %c], %cst
///     { __vector_to_scf_lowering__ } : vector<5x4xf32>, memref<?x?x?xf32>
/// memref.store %1, %0[] : memref<vector<5x4xf32>>
/// %vec = memref.load %0[] : memref<vector<5x4xf32>>
/// ```
///
/// Note: A second temporary buffer may be allocated for the `mask` operand.
struct PrepareTransferReadConversion
    : public VectorToSCFPattern<TransferReadOp> {
  using VectorToSCFPattern<TransferReadOp>::VectorToSCFPattern;

  LogicalResult matchAndRewrite(TransferReadOp xferOp,
                                PatternRewriter &rewriter) const override {
    // Skip ops that are already labeled or at/below the target rank.
    if (checkPrepareXferOp(xferOp, options).failed())
      return failure();

    auto buffers = allocBuffers(rewriter, xferOp);
    // Clone the transfer op and label the clone for progressive lowering.
    auto *newXfer = rewriter.clone(*xferOp.getOperation());
    newXfer->setAttr(kPassLabel, rewriter.getUnitAttr());
    if (xferOp.mask()) {
      // Use the buffered mask value (produced in allocBuffers) on the clone.
      dyn_cast<TransferReadOp>(newXfer).maskMutable().assign(
          buffers.maskBuffer);
    }

    Location loc = xferOp.getLoc();
    // Store the clone's result to the buffer, then replace all uses of the
    // original op with a load from that buffer.
    rewriter.create<memref::StoreOp>(loc, newXfer->getResult(0),
                                     buffers.dataBuffer);
    rewriter.replaceOpWithNewOp<memref::LoadOp>(xferOp, buffers.dataBuffer);

    return success();
  }
};
539 
/// Prepare a TransferWriteOp for progressive lowering.
///
/// 1. Allocate a temporary buffer.
/// 2. Store the vector into the buffer.
/// 3. Load the vector from the buffer again.
/// 4. Use the loaded vector as a TransferWriteOp operand and label the op,
///    marking it eligible for progressive lowering via TransferOpConversion.
///
/// E.g.:
/// ```
/// vector.transfer_write %vec, %A[%a, %b, %c]
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
/// is rewritten to:
/// ```
/// %0 = memref.alloca() : memref<vector<5x4xf32>>
/// memref.store %vec, %0[] : memref<vector<5x4xf32>>
/// %1 = memref.load %0[] : memref<vector<5x4xf32>>
/// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ }
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
///
/// Note: A second temporary buffer may be allocated for the `mask` operand.
struct PrepareTransferWriteConversion
    : public VectorToSCFPattern<TransferWriteOp> {
  using VectorToSCFPattern<TransferWriteOp>::VectorToSCFPattern;

  LogicalResult matchAndRewrite(TransferWriteOp xferOp,
                                PatternRewriter &rewriter) const override {
    // Skip ops that are already labeled or at/below the target rank.
    if (checkPrepareXferOp(xferOp, options).failed())
      return failure();

    Location loc = xferOp.getLoc();
    auto buffers = allocBuffers(rewriter, xferOp);
    // Round-trip the vector through the buffer, then rewire the op in place
    // (within updateRootInPlace so the rewriter is notified).
    rewriter.create<memref::StoreOp>(loc, xferOp.vector(), buffers.dataBuffer);
    auto loadedVec = rewriter.create<memref::LoadOp>(loc, buffers.dataBuffer);
    rewriter.updateRootInPlace(xferOp, [&]() {
      xferOp.vectorMutable().assign(loadedVec);
      xferOp->setAttr(kPassLabel, rewriter.getUnitAttr());
    });

    if (xferOp.mask()) {
      // Likewise swap in the buffered mask value produced by allocBuffers.
      rewriter.updateRootInPlace(
          xferOp, [&]() { xferOp.maskMutable().assign(buffers.maskBuffer); });
    }

    return success();
  }
};
589 
/// Progressive lowering of vector transfer ops: Unpack one dimension.
///
/// 1. Unpack one dimension from the current buffer type and cast the buffer
///    to that new type. E.g.:
///    ```
///    %vec = memref.load %0[%1] : memref<5xvector<4x3xf32>>
///    vector.transfer_write %vec ...
///    ```
///    The following cast is generated:
///    ```
///    %casted = vector.type_cast %0
///        : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
///    ```
/// 2. Generate a for loop and rewrite the transfer op according to the
///    corresponding Strategy<OpTy>. If the to-be-unpacked dimension can be
///    out-of-bounds, generate an if-check and handle both cases separately.
/// 3. Clean up according to the corresponding Strategy<OpTy>.
template <typename OpTy>
struct TransferOpConversion : public VectorToSCFPattern<OpTy> {
  using VectorToSCFPattern<OpTy>::VectorToSCFPattern;

  LogicalResult matchAndRewrite(OpTy xferOp,
                                PatternRewriter &rewriter) const override {
    // Only ops labeled by a Prepare*Conversion pattern are handled here.
    if (!xferOp->hasAttr(kPassLabel))
      return failure();

    // Find and cast data buffer. How the buffer can be found depends on OpTy.
    ImplicitLocOpBuilder locB(xferOp.getLoc(), rewriter);
    auto dataBuffer = Strategy<OpTy>::getBuffer(xferOp);
    auto dataBufferType = dataBuffer.getType().template dyn_cast<MemRefType>();
    auto castedDataType = unpackOneDim(dataBufferType);
    auto castedDataBuffer =
        locB.create<vector::TypeCastOp>(castedDataType, dataBuffer);

    // If the xferOp has a mask: Find and cast mask buffer.
    Value castedMaskBuffer;
    if (xferOp.mask()) {
      auto maskBuffer = getMaskBuffer(xferOp);
      auto maskBufferType =
          maskBuffer.getType().template dyn_cast<MemRefType>();
      if (xferOp.isBroadcastDim(0) || xferOp.getMaskType().getRank() == 1) {
        // Do not unpack a dimension of the mask, if:
        // * To-be-unpacked transfer op dimension is a broadcast.
        // * Mask is 1D, i.e., the mask cannot be further unpacked.
        //   (That means that all remaining dimensions of the transfer op must
        //   be broadcasted.)
        castedMaskBuffer = maskBuffer;
      } else {
        auto castedMaskType = unpackOneDim(maskBufferType);
        castedMaskBuffer =
            locB.create<vector::TypeCastOp>(castedMaskType, maskBuffer);
      }
    }

    // Loop bounds and step: iterate over the unpacked (outermost buffered)
    // dimension.
    auto lb = locB.create<ConstantIndexOp>(0);
    auto ub = locB.create<ConstantIndexOp>(
        castedDataType.getDimSize(castedDataType.getRank() - 1));
    auto step = locB.create<ConstantIndexOp>(1);

    // Generate for loop.
    locB.create<scf::ForOp>(
        lb, ub, step, ValueRange(),
        [&](OpBuilder &b, Location loc, Value iv, ValueRange /*loopState*/) {
          generateInBoundsCheck(
              b, xferOp, iv, unpackedDim(xferOp),
              /*inBoundsCase=*/
              [&](OpBuilder &b, Location loc) {
                // Create new transfer op.
                OpTy newXfer = Strategy<OpTy>::rewriteOp(
                    b, this->options, xferOp, castedDataBuffer, iv);

                // If the old transfer op has a mask: Set the mask on the new
                // transfer op. Special case: If the mask of the old transfer
                // op is 1D and the unpacked dim is not a broadcast, no mask
                // is needed on the new transfer op.
                if (xferOp.mask() && (xferOp.isBroadcastDim(0) ||
                                      xferOp.getMaskType().getRank() > 1)) {
                  OpBuilder::InsertionGuard guard(b);
                  b.setInsertionPoint(newXfer); // Insert load before newXfer.

                  SmallVector<Value, 8> loadIndices;
                  Strategy<OpTy>::getBufferIndices(xferOp, loadIndices);
                  // In case of broadcast: Use same indices to load from memref
                  // as before.
                  if (!xferOp.isBroadcastDim(0))
                    loadIndices.push_back(iv);

                  auto mask = b.create<memref::LoadOp>(loc, castedMaskBuffer,
                                                       loadIndices);
                  rewriter.updateRootInPlace(
                      newXfer, [&]() { newXfer.maskMutable().assign(mask); });
                }
              },
              /*outOfBoundsCase=*/
              [&](OpBuilder &b, Location /*loc*/) {
                Strategy<OpTy>::handleOutOfBoundsDim(b, xferOp,
                                                     castedDataBuffer, iv);
              });
          b.create<scf::YieldOp>(loc);
        });

    Strategy<OpTy>::cleanup(rewriter, xferOp);
    return success();
  }
};
697 
698 } // namespace lowering_n_d
699 
700 namespace lowering_n_d_unrolled {
701 
702 /// If the original transfer op has a mask, compute the mask of the new transfer
703 /// op (for the current iteration `i`) and assign it.
704 template <typename OpTy>
705 static void maybeAssignMask(OpBuilder &b, OpTy xferOp, OpTy newXferOp,
706                             int64_t i) {
707   if (!xferOp.mask())
708     return;
709 
710   if (xferOp.isBroadcastDim(0)) {
711     // To-be-unpacked dimension is a broadcast, which does not have a
712     // corresponding mask dimension. Mask attribute remains unchanged.
713     newXferOp.maskMutable().assign(xferOp.mask());
714     return;
715   }
716 
717   if (xferOp.getMaskType().getRank() > 1) {
718     // Unpack one dimension of the mask.
719     OpBuilder::InsertionGuard guard(b);
720     b.setInsertionPoint(newXferOp); // Insert load before newXfer.
721 
722     llvm::SmallVector<int64_t, 1> indices({i});
723     Location loc = xferOp.getLoc();
724     auto newMask = b.create<vector::ExtractOp>(loc, xferOp.mask(), indices);
725     newXferOp.maskMutable().assign(newMask);
726   }
727 
728   // If we end up here: The mask of the old transfer op is 1D and the unpacked
729   // dim is not a broadcast, so no mask is needed on the new transfer op.
730   // `generateInBoundsCheck` will have evaluated the mask already.
731 }
732 
733 /// Progressive lowering of vector TransferReadOp with unrolling: Unpack one
734 /// dimension. This is similar to TransferOpConversion<TransferReadOp>, but no
735 /// memref buffer is allocated and the SCF loop is fully unrolled.
736 ///
737 /// ```
738 /// E.g.:
739 /// ```
740 /// %vec = vector.transfer_read %A[%a, %b, %c], %padding
741 ///     : memref<?x?x?xf32>, vector<5x4xf32>
742 /// ```
743 /// is rewritten to IR such as (simplified):
744 /// ```
745 /// %v_init = splat %padding : vector<5x4xf32>
746 /// %tmp0 = vector.transfer_read %A[%a, %b, %c], %padding
747 ///     : memref<?x?x?xf32>, vector<4xf32>
748 /// %v0 = vector.insert %tmp0, %v_init[0] : vector<4xf32> into vector<5x4xf32>
749 /// %tmp1 = vector.transfer_read %A[%a, %b + 1, %c], %padding
750 ///     : memref<?x?x?xf32>, vector<4xf32>
751 /// %v1 = vector.insert %tmp1, %v0[1] : vector<4xf32> into vector<5x4xf32>
752 /// ...
753 /// %tmp4 = vector.transfer_read %A[%a, %b + 4, %c], %padding
754 ///     : memref<?x?x?xf32>, vector<4xf32>
/// %vec = vector.insert %tmp4, %v3[4] : vector<4xf32> into vector<5x4xf32>
756 /// ```
757 ///
758 /// Note: As an optimization, if the result of the original TransferReadOp
759 /// was directly inserted into another vector, no new %v_init vector is created.
760 /// Instead, the new TransferReadOp results are inserted into that vector.
struct UnrollTransferReadConversion
    : public VectorToSCFPattern<TransferReadOp> {
  using VectorToSCFPattern<TransferReadOp>::VectorToSCFPattern;

  /// Return the vector into which the newly created TransferReadOp results
  /// are inserted. If the transfer's single user is a vector.insert, reuse
  /// that op's destination vector; otherwise materialize a new vector filled
  /// with the padding value.
  Value getResultVector(TransferReadOp xferOp,
                        PatternRewriter &rewriter) const {
    if (auto insertOp = getInsertOp(xferOp))
      return insertOp.dest();
    Location loc = xferOp.getLoc();
    return rewriter.create<SplatOp>(loc, xferOp.getVectorType(),
                                    xferOp.padding());
  }

  /// If the result of the TransferReadOp has exactly one user, which is a
  /// vector::InsertOp, return that operation. Otherwise return a null op.
  vector::InsertOp getInsertOp(TransferReadOp xferOp) const {
    if (xferOp->hasOneUse()) {
      Operation *xferOpUser = *xferOp->getUsers().begin();
      if (auto insertOp = dyn_cast<vector::InsertOp>(xferOpUser))
        return insertOp;
    }

    // No unique InsertOp user: return a null op.
    return vector::InsertOp();
  }

  /// If the result of the TransferReadOp has exactly one user, which is a
  /// vector::InsertOp, append that operation's indices to `indices`.
  /// Otherwise leave `indices` unchanged.
  void getInsertionIndices(TransferReadOp xferOp,
                           SmallVector<int64_t, 8> &indices) const {
    if (auto insertOp = getInsertOp(xferOp)) {
      llvm::for_each(insertOp.position(), [&](Attribute attr) {
        // vector.insert positions are integer attributes.
        indices.push_back(attr.dyn_cast<IntegerAttr>().getInt());
      });
    }
  }

  /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
  /// accesses, and broadcasts and transposes in permutation maps.
  LogicalResult matchAndRewrite(TransferReadOp xferOp,
                                PatternRewriter &rewriter) const override {
    // Only unroll ops whose vector rank is still above the target rank.
    if (xferOp.getVectorType().getRank() <= options.targetRank)
      return failure();

    auto insertOp = getInsertOp(xferOp);
    auto vec = getResultVector(xferOp, rewriter);
    auto vecType = vec.getType().dyn_cast<VectorType>();
    auto xferVecType = xferOp.getVectorType();
    // The new transfer ops read vectors with the leading dimension dropped.
    auto newXferVecType = VectorType::get(xferVecType.getShape().drop_front(),
                                          xferVecType.getElementType());
    int64_t dimSize = xferVecType.getShape()[0];

    // Generate fully unrolled loop of transfer ops.
    Location loc = xferOp.getLoc();
    for (int64_t i = 0; i < dimSize; ++i) {
      Value iv = rewriter.create<ConstantIndexOp>(loc, i);

      // `vec` is threaded through the unrolled iterations: each in-bounds
      // iteration yields a vector with one more sub-vector inserted.
      vec = generateInBoundsCheck(
          rewriter, xferOp, iv, unpackedDim(xferOp), TypeRange(vecType),
          /*inBoundsCase=*/
          [&](OpBuilder &b, Location loc) {
            // Indices for the new transfer op.
            SmallVector<Value, 8> xferIndices;
            getXferIndices(b, xferOp, iv, xferIndices);

            // Indices for the new vector.insert op.
            SmallVector<int64_t, 8> insertionIndices;
            getInsertionIndices(xferOp, insertionIndices);
            insertionIndices.push_back(i);

            auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
            auto newXferOp = b.create<vector::TransferReadOp>(
                loc, newXferVecType, xferOp.source(), xferIndices,
                AffineMapAttr::get(unpackedPermutationMap(b, xferOp)),
                xferOp.padding(), Value(), inBoundsAttr);
            maybeAssignMask(b, xferOp, newXferOp, i);
            return b.create<vector::InsertOp>(loc, newXferOp, vec,
                                              insertionIndices);
          },
          /*outOfBoundsCase=*/
          [&](OpBuilder &b, Location loc) {
            // Loop through original (unmodified) vector.
            return vec;
          });
    }

    if (insertOp) {
      // Rewrite single user of the old TransferReadOp, which was an InsertOp.
      rewriter.replaceOp(insertOp, vec);
      rewriter.eraseOp(xferOp);
    } else {
      rewriter.replaceOp(xferOp, vec);
    }

    return success();
  }
};
859 
860 /// Progressive lowering of vector TransferWriteOp with unrolling: Unpack one
861 /// dimension. This is similar to TransferOpConversion<TransferWriteOp>, but no
862 /// memref buffer is allocated and the SCF loop is fully unrolled.
863 ///
864 /// ```
865 /// E.g.:
866 /// ```
867 /// vector.transfer_write %vec, %A[%a, %b, %c]
868 ///     : vector<5x4xf32>, memref<?x?x?xf32>
869 /// ```
870 /// is rewritten to IR such as (simplified):
871 /// ```
872 /// %v0 = vector.extract %vec[0] : vector<5x4xf32>
873 /// vector.transfer_write %v0, %A[%a, %b, %c] : vector<4xf32>, memref<...>
874 /// %v1 = vector.extract %vec[1] : vector<5x4xf32>
875 /// vector.transfer_write %v1, %A[%a, %b + 1, %c] : vector<4xf32>, memref<...>
876 /// ...
877 /// %v4 = vector.extract %vec[4] : vector<5x4xf32>
878 /// vector.transfer_write %v4, %A[%a, %b + 4, %c] : vector<4xf32>, memref<...>
879 /// ```
880 ///
881 /// Note: As an optimization, if the vector of the original TransferWriteOp
882 /// was directly extracted from another vector via an ExtractOp `a`, extract
883 /// the vectors for the newly generated TransferWriteOps from `a`'s input. By
884 /// doing so, `a` may become dead, and the number of ExtractOps generated during
885 /// recursive application of this pattern will be minimal.
struct UnrollTransferWriteConversion
    : public VectorToSCFPattern<TransferWriteOp> {
  using VectorToSCFPattern<TransferWriteOp>::VectorToSCFPattern;

  /// Return the vector from which newly generated ExtractOps will extract:
  /// the source of the defining ExtractOp if there is one, otherwise the
  /// transfer's own vector operand.
  Value getDataVector(TransferWriteOp xferOp) const {
    if (auto extractOp = getExtractOp(xferOp))
      return extractOp.vector();
    return xferOp.vector();
  }

  /// If the input of the given TransferWriteOp is an ExtractOp, return it.
  /// Otherwise return a null op.
  vector::ExtractOp getExtractOp(TransferWriteOp xferOp) const {
    if (auto *op = xferOp.vector().getDefiningOp())
      return dyn_cast<vector::ExtractOp>(op);
    return vector::ExtractOp();
  }

  /// If the input of the given TransferWriteOp is an ExtractOp, append its
  /// indices to `indices`. Otherwise leave `indices` unchanged.
  void getExtractionIndices(TransferWriteOp xferOp,
                            SmallVector<int64_t, 8> &indices) const {
    if (auto extractOp = getExtractOp(xferOp)) {
      llvm::for_each(extractOp.position(), [&](Attribute attr) {
        // vector.extract positions are integer attributes.
        indices.push_back(attr.dyn_cast<IntegerAttr>().getInt());
      });
    }
  }

  /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
  /// accesses, and broadcasts and transposes in permutation maps.
  LogicalResult matchAndRewrite(TransferWriteOp xferOp,
                                PatternRewriter &rewriter) const override {
    // Only unroll ops whose vector rank is still above the target rank.
    if (xferOp.getVectorType().getRank() <= options.targetRank)
      return failure();

    auto vec = getDataVector(xferOp);
    auto xferVecType = xferOp.getVectorType();
    int64_t dimSize = xferVecType.getShape()[0];

    // Generate fully unrolled loop of transfer ops.
    Location loc = xferOp.getLoc();
    for (int64_t i = 0; i < dimSize; ++i) {
      Value iv = rewriter.create<ConstantIndexOp>(loc, i);

      // No out-of-bounds case is given: nothing is written for indices that
      // are out of bounds.
      generateInBoundsCheck(
          rewriter, xferOp, iv, unpackedDim(xferOp),
          /*inBoundsCase=*/[&](OpBuilder &b, Location loc) {
            // Indices for the new transfer op.
            SmallVector<Value, 8> xferIndices;
            getXferIndices(b, xferOp, iv, xferIndices);

            // Indices for the new vector.extract op.
            SmallVector<int64_t, 8> extractionIndices;
            getExtractionIndices(xferOp, extractionIndices);
            extractionIndices.push_back(i);

            auto extracted =
                b.create<vector::ExtractOp>(loc, vec, extractionIndices);
            auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());

            auto newXferOp = b.create<vector::TransferWriteOp>(
                loc, Type(), extracted, xferOp.source(), xferIndices,
                AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(),
                inBoundsAttr);

            maybeAssignMask(b, xferOp, newXferOp, i);
          });
    }

    rewriter.eraseOp(xferOp);
    return success();
  }
};
960 
961 } // namespace lowering_n_d_unrolled
962 
963 namespace lowering_1_d {
964 
965 /// Compute the indices into the memref for the LoadOp/StoreOp generated as
966 /// part of TransferOp1dConversion. Return the memref dimension on which
967 /// the transfer is operating. A return value of None indicates a broadcast.
968 template <typename OpTy>
969 static Optional<int64_t>
970 get1dMemrefIndices(OpBuilder &b, OpTy xferOp, Value iv,
971                    SmallVector<Value, 8> &memrefIndices) {
972   auto indices = xferOp.indices();
973   auto map = xferOp.permutation_map();
974 
975   memrefIndices.append(indices.begin(), indices.end());
976   assert(map.getNumResults() == 1 &&
977          "Expected 1 permutation map result for 1D transfer");
978   if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
979     Location loc = xferOp.getLoc();
980     auto dim = expr.getPosition();
981     AffineExpr d0, d1;
982     bindDims(xferOp.getContext(), d0, d1);
983     Value offset = memrefIndices[dim];
984     memrefIndices[dim] = makeComposedAffineApply(b, loc, d0 + d1, {offset, iv});
985     return dim;
986   }
987 
988   assert(xferOp.isBroadcastDim(0) &&
989          "Expected AffineDimExpr or AffineConstantExpr");
990   return None;
991 }
992 
/// Codegen strategy for TransferOp1dConversion, depending on the
/// operation. Each specialization (see below) provides `generateForLoopBody`,
/// which emits the body of one scf.for iteration, and `initialLoopState`,
/// which produces the loop-carried value (or a null Value for none).
template <typename OpTy>
struct Strategy1d;
997 
/// Codegen strategy for TransferReadOp.
template <>
struct Strategy1d<TransferReadOp> {
  /// Emit one loop iteration: load a scalar from the memref and insert it
  /// into the loop-carried vector at position `iv`. Out-of-bounds iterations
  /// yield the vector unchanged.
  static void generateForLoopBody(OpBuilder &b, Location loc,
                                  TransferReadOp xferOp, Value iv,
                                  ValueRange loopState) {
    SmallVector<Value, 8> indices;
    auto dim = get1dMemrefIndices(b, xferOp, iv, indices);
    // vector.insertelement takes an i32 position, so cast the index IV.
    Value ivI32 =
        b.create<IndexCastOp>(loc, IntegerType::get(b.getContext(), 32), iv);
    auto vec = loopState[0];

    // In case of out-of-bounds access, leave `vec` as is (was initialized with
    // padding value).
    auto nextVec = generateInBoundsCheck(
        b, xferOp, iv, dim, TypeRange(xferOp.getVectorType()),
        /*inBoundsCase=*/
        [&](OpBuilder &b, Location loc) {
          Value val = b.create<memref::LoadOp>(loc, xferOp.source(), indices);
          return b.create<vector::InsertElementOp>(loc, val, vec, ivI32);
        },
        /*outOfBoundsCase=*/
        [&](OpBuilder & /*b*/, Location loc) { return vec; });
    b.create<scf::YieldOp>(loc, nextVec);
  }

  /// Return the initial loop state: a vector filled with the padding value.
  static Value initialLoopState(OpBuilder &b, TransferReadOp xferOp) {
    // Initialize vector with padding value.
    Location loc = xferOp.getLoc();
    return b.create<SplatOp>(loc, xferOp.getVectorType(), xferOp.padding());
  }
};
1030 
1031 /// Codegen strategy for TransferWriteOp.
1032 template <>
1033 struct Strategy1d<TransferWriteOp> {
1034   static void generateForLoopBody(OpBuilder &b, Location loc,
1035                                   TransferWriteOp xferOp, Value iv,
1036                                   ValueRange /*loopState*/) {
1037     SmallVector<Value, 8> indices;
1038     auto dim = get1dMemrefIndices(b, xferOp, iv, indices);
1039     Value ivI32 =
1040         b.create<IndexCastOp>(loc, IntegerType::get(b.getContext(), 32), iv);
1041 
1042     // Nothing to do in case of out-of-bounds access.
1043     generateInBoundsCheck(
1044         b, xferOp, iv, dim,
1045         /*inBoundsCase=*/[&](OpBuilder &b, Location loc) {
1046           auto val =
1047               b.create<vector::ExtractElementOp>(loc, xferOp.vector(), ivI32);
1048           b.create<memref::StoreOp>(loc, val, xferOp.source(), indices);
1049         });
1050     b.create<scf::YieldOp>(loc);
1051   }
1052 
1053   static Value initialLoopState(OpBuilder &b, TransferWriteOp xferOp) {
1054     return Value();
1055   }
1056 };
1057 
1058 /// Return true if the last dimension of the MemRefType has unit stride.
1059 static bool isLastMemrefDimUnitStride(MemRefType type) {
1060   int64_t offset;
1061   SmallVector<int64_t, 4> strides;
1062   auto successStrides = getStridesAndOffset(type, strides, offset);
1063   return succeeded(successStrides) && strides.back() == 1;
1064 }
1065 
1066 /// Lower a 1D vector transfer op to SCF using scalar loads/stores. This is
1067 /// necessary in cases where a 1D vector transfer op cannot be lowered into
1068 /// vector load/stores due to non-unit strides or broadcasts:
1069 ///
1070 /// * Transfer dimension is not the last memref dimension
1071 /// * Transfer dimension is a broadcast (i.e., scalar load + broadcast)
1072 /// * Memref has a layout map with non-unit stride on the last dimension
1073 ///
1074 /// This pattern generates IR as follows:
1075 ///
1076 /// 1. Generate a for loop iterating over each vector element.
1077 /// 2. Inside the loop, generate a InsertElementOp or ExtractElementOp,
1078 ///    depending on OpTy.
1079 ///
1080 /// TODO: In some cases (no masking, etc.), LLVM::MatrixColumnMajorLoadOp
1081 ///       can be generated instead of TransferOp1dConversion. Add such a pattern
1082 ///       to ConvertVectorToLLVM.
1083 ///
1084 /// E.g.:
1085 /// ```
1086 /// vector.transfer_write %vec, %A[%a, %b]
1087 ///    {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
1088 ///    : vector<9xf32>, memref<?x?xf32>
1089 /// ```
1090 /// Is rewritten to approximately the following pseudo-IR:
1091 /// ```
1092 /// for i = 0 to 9 {
1093 ///   %t = vector.extractelement %vec[i] : vector<9xf32>
1094 ///   memref.store %t, %arg0[%a + i, %b] : memref<?x?xf32>
1095 /// }
1096 /// ```
template <typename OpTy>
struct TransferOp1dConversion : public VectorToSCFPattern<OpTy> {
  using VectorToSCFPattern<OpTy>::VectorToSCFPattern;

  LogicalResult matchAndRewrite(OpTy xferOp,
                                PatternRewriter &rewriter) const override {
    auto map = xferOp.permutation_map();
    auto memRefType = xferOp.getShapedType().template dyn_cast<MemRefType>();

    // This pattern only applies to 1-D transfers on memrefs.
    if (!memRefType)
      return failure();
    if (xferOp.getVectorType().getRank() != 1)
      return failure();
    // Contiguous (minor identity map, unit stride) transfers are lowered to
    // vector load/stores elsewhere; leave them alone.
    if (map.isMinorIdentity() && isLastMemrefDimUnitStride(memRefType))
      return failure(); // Handled by ConvertVectorToLLVM

    // Loop bounds, step, state...
    Location loc = xferOp.getLoc();
    auto vecType = xferOp.getVectorType();
    auto lb = rewriter.create<ConstantIndexOp>(loc, 0);
    auto ub = rewriter.create<ConstantIndexOp>(loc, vecType.getDimSize(0));
    auto step = rewriter.create<ConstantIndexOp>(loc, 1);
    // Null loop state (e.g. for TransferWriteOp) means the loop carries no
    // iter_args.
    auto loopState = Strategy1d<OpTy>::initialLoopState(rewriter, xferOp);

    // Generate for loop that iterates over each vector element; the strategy
    // emits the per-element load/store in the loop body.
    rewriter.replaceOpWithNewOp<scf::ForOp>(
        xferOp, lb, ub, step, loopState ? ValueRange(loopState) : ValueRange(),
        [&](OpBuilder &b, Location loc, Value iv, ValueRange loopState) {
          Strategy1d<OpTy>::generateForLoopBody(b, loc, xferOp, iv, loopState);
        });

    return success();
  }
};
1131 
1132 } // namespace lowering_1_d
1133 } // namespace
1134 
1135 namespace mlir {
1136 
1137 void populateVectorToSCFConversionPatterns(
1138     RewritePatternSet &patterns, const VectorTransferToSCFOptions &options) {
1139   if (options.unroll) {
1140     patterns.add<lowering_n_d_unrolled::UnrollTransferReadConversion,
1141                  lowering_n_d_unrolled::UnrollTransferWriteConversion>(
1142         patterns.getContext(), options);
1143   } else {
1144     patterns.add<lowering_n_d::PrepareTransferReadConversion,
1145                  lowering_n_d::PrepareTransferWriteConversion,
1146                  lowering_n_d::TransferOpConversion<TransferReadOp>,
1147                  lowering_n_d::TransferOpConversion<TransferWriteOp>>(
1148         patterns.getContext(), options);
1149   }
1150 
1151   if (options.targetRank == 1) {
1152     patterns.add<lowering_1_d::TransferOp1dConversion<TransferReadOp>,
1153                  lowering_1_d::TransferOp1dConversion<TransferWriteOp>>(
1154         patterns.getContext(), options);
1155   }
1156 }
1157 
1158 } // namespace mlir
1159 
1160 namespace {
1161 
1162 struct ConvertVectorToSCFPass
1163     : public ConvertVectorToSCFBase<ConvertVectorToSCFPass> {
1164   ConvertVectorToSCFPass() = default;
1165   ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
1166     this->fullUnroll = options.unroll;
1167     this->targetRank = options.targetRank;
1168     this->lowerPermutationMaps = options.lowerPermutationMaps;
1169   }
1170 
1171   void runOnFunction() override {
1172     VectorTransferToSCFOptions options;
1173     options.unroll = fullUnroll;
1174     options.targetRank = targetRank;
1175     options.lowerPermutationMaps = lowerPermutationMaps;
1176 
1177     // Lower permutation maps first.
1178     if (lowerPermutationMaps) {
1179       RewritePatternSet lowerTransferPatterns(getFunction().getContext());
1180       mlir::vector::populateVectorTransferPermutationMapLoweringPatterns(
1181           lowerTransferPatterns);
1182       (void)applyPatternsAndFoldGreedily(getFunction(),
1183                                          std::move(lowerTransferPatterns));
1184     }
1185 
1186     RewritePatternSet patterns(getFunction().getContext());
1187     populateVectorToSCFConversionPatterns(patterns, options);
1188     (void)applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));
1189   }
1190 };
1191 
1192 } // namespace
1193 
1194 std::unique_ptr<Pass>
1195 mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
1196   return std::make_unique<ConvertVectorToSCFPass>(options);
1197 }
1198