1 //===- SuperVectorize.cpp - Vectorize Pass Impl ---------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements vectorization of loops, operations and data types to
10 // a target-independent, n-D super-vector abstraction.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "mlir/Analysis/LoopAnalysis.h"
15 #include "mlir/Analysis/NestedMatcher.h"
16 #include "mlir/Analysis/SliceAnalysis.h"
17 #include "mlir/Analysis/Utils.h"
18 #include "mlir/Dialect/Affine/IR/AffineOps.h"
19 #include "mlir/Dialect/Affine/Passes.h"
20 #include "mlir/Dialect/StandardOps/IR/Ops.h"
21 #include "mlir/Dialect/Vector/VectorOps.h"
22 #include "mlir/Dialect/Vector/VectorUtils.h"
23 #include "mlir/IR/AffineExpr.h"
24 #include "mlir/IR/Builders.h"
25 #include "mlir/IR/Location.h"
26 #include "mlir/IR/Types.h"
27 #include "mlir/Pass/Pass.h"
28 #include "mlir/Support/Functional.h"
29 #include "mlir/Support/LLVM.h"
30 #include "mlir/Transforms/FoldUtils.h"
31 
32 #include "llvm/ADT/DenseMap.h"
33 #include "llvm/ADT/DenseSet.h"
34 #include "llvm/ADT/SetVector.h"
35 #include "llvm/ADT/SmallString.h"
36 #include "llvm/ADT/SmallVector.h"
37 #include "llvm/Support/CommandLine.h"
38 #include "llvm/Support/Debug.h"
39 
40 using namespace mlir;
41 
///
/// Implements a high-level vectorization strategy on a Function.
/// The abstraction used is that of super-vectors, which burn into the vector
/// types a single, compact representation of information that is expected to
/// reduce the impact of the phase ordering problem.
///
48 /// Vector granularity:
49 /// ===================
50 /// This pass is designed to perform vectorization at a super-vector
51 /// granularity. A super-vector is loosely defined as a vector type that is a
52 /// multiple of a "good" vector size so the HW can efficiently implement a set
53 /// of high-level primitives. Multiple is understood along any dimension; e.g.
54 /// both vector<16xf32> and vector<2x8xf32> are valid super-vectors for a
55 /// vector<8xf32> HW vector. Note that a "good vector size so the HW can
56 /// efficiently implement a set of high-level primitives" is not necessarily an
57 /// integer multiple of actual hardware registers. We leave details of this
58 /// distinction unspecified for now.
59 ///
/// Some may prefer the terminology of a "tile of HW vectors". In this case,
/// one should note that super-vectors implement an "always full tile"
/// abstraction. They guarantee no partial-tile separation is necessary by
/// relying on a high-level copy-reshape abstraction that we call
/// vector.transfer. This copy-reshape operation is also responsible for
/// performing layout transposition if necessary. In the general case this will
/// require a scoped allocation in some notional local memory.
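///
/// For instance, a transposing super-vector read could be sketched as follows.
/// This is a purely illustrative, hand-written example in the simplified
/// notation used throughout this comment (the padding value operand is elided
/// and the exact permutation_map spelling should be taken as an assumption,
/// not as the authoritative op syntax):
/// ```mlir
///   %v_a = vector.transfer_read %A[%i, %j]
///          {permutation_map = affine_map<(d0, d1) -> (d1, d0)>}
///          : memref<?x?xf32>, vector<32x256xf32>
/// ```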
67 ///
68 /// Whatever the mental model one prefers to use for this abstraction, the key
69 /// point is that we burn into a single, compact, representation in the vector
70 /// types, information that is expected to reduce the impact of the phase
71 /// ordering problem. Indeed, a vector type conveys information that:
72 ///   1. the associated loops have dependency semantics that do not prevent
73 ///      vectorization;
///   2. the associated loops have been sliced in chunks of static sizes that
///      are compatible with vector sizes (i.e. similar to unroll-and-jam);
///   3. the inner loops, in the unroll-and-jam analogy of 2., are captured by
///      the vector type and no vectorization-hampering transformations can be
///      applied to them anymore;
///   4. the underlying memrefs are accessed in some notional contiguous way
///      that allows loading into vectors with some amount of spatial locality.
/// In other words, super-vectorization provides a level of separation of
/// concerns by way of opacity to subsequent passes. This has the effect of
/// encapsulating and propagating vectorization constraints down the list of
/// passes until we are ready to lower further.
86 ///
/// For a particular target, a notion of minimal n-D vector size will be
/// specified and vectorization targets a multiple of that size. In the
/// following paragraph, let "k ." represent "a multiple of", to be understood
/// as a multiple in the same dimension (e.g. vector<16 x k . 128> summarizes
/// vector<16 x 128>, vector<16 x 256>, vector<16 x 1024>, etc).
92 ///
93 /// Some non-exhaustive notable super-vector sizes of interest include:
94 ///   - CPU: vector<k . HW_vector_size>,
95 ///          vector<k' . core_count x k . HW_vector_size>,
96 ///          vector<socket_count x k' . core_count x k . HW_vector_size>;
97 ///   - GPU: vector<k . warp_size>,
98 ///          vector<k . warp_size x float2>,
99 ///          vector<k . warp_size x float4>,
///          vector<k . warp_size x 4 x 4 x 4> (for tensor_core sizes).
101 ///
102 /// Loops and operations are emitted that operate on those super-vector shapes.
103 /// Subsequent lowering passes will materialize to actual HW vector sizes. These
104 /// passes are expected to be (gradually) more target-specific.
105 ///
106 /// At a high level, a vectorized load in a loop will resemble:
107 /// ```mlir
108 ///   affine.for %i = ? to ? step ? {
109 ///     %v_a = vector.transfer_read A[%i] : memref<?xf32>, vector<128xf32>
110 ///   }
111 /// ```
112 /// It is the responsibility of the implementation of vector.transfer_read to
113 /// materialize vector registers from the original scalar memrefs. A later (more
114 /// target-dependent) lowering pass will materialize to actual HW vector sizes.
/// This lowering may occur at different times:
116 ///   1. at the MLIR level into a combination of loops, unrolling, DmaStartOp +
117 ///      DmaWaitOp + vectorized operations for data transformations and shuffle;
118 ///      thus opening opportunities for unrolling and pipelining. This is an
119 ///      instance of library call "whiteboxing"; or
///   2. later, in a target-specific lowering pass or hand-written library
///      call, achieving full separation of concerns. This is an instance of a
///      library call; or
123 ///   3. a mix of both, e.g. based on a model.
124 /// In the future, these operations will expose a contract to constrain the
125 /// search on vectorization patterns and sizes.
126 ///
127 /// Occurrence of super-vectorization in the compiler flow:
128 /// =======================================================
129 /// This is an active area of investigation. We start with 2 remarks to position
130 /// super-vectorization in the context of existing ongoing work: LLVM VPLAN
131 /// and LLVM SLP Vectorizer.
132 ///
133 /// LLVM VPLAN:
134 /// -----------
/// The astute reader may have noticed that in the limit, super-vectorization
/// can be applied at a similar time and with similar objectives as VPLAN.
/// For instance, it can be applied late in a traditional polyhedral
/// compilation flow (the PPCG project, for example, uses ISL to provide
/// dependence analysis, multi-level scheduling + tiling, lifting of footprints
/// to fast memory, communication synthesis, mapping, and register
/// optimizations) and before unrolling. When vectorization is applied at this
/// *late* level in a typical
142 /// polyhedral flow, and is instantiated with actual hardware vector sizes,
143 /// super-vectorization is expected to match (or subsume) the type of patterns
144 /// that LLVM's VPLAN aims at targeting. The main difference here is that MLIR
145 /// is higher level and our implementation should be significantly simpler. Also
/// note that in this mode, recursive patterns are probably a bit of overkill,
147 /// although it is reasonable to expect that mixing a bit of outer loop and
148 /// inner loop vectorization + unrolling will provide interesting choices to
149 /// MLIR.
150 ///
151 /// LLVM SLP Vectorizer:
152 /// --------------------
153 /// Super-vectorization however is not meant to be usable in a similar fashion
154 /// to the SLP vectorizer. The main difference lies in the information that
155 /// both vectorizers use: super-vectorization examines contiguity of memory
156 /// references along fastest varying dimensions and loops with recursive nested
157 /// patterns capturing imperfectly-nested loop nests; the SLP vectorizer, on
158 /// the other hand, performs flat pattern matching inside a single unrolled loop
159 /// body and stitches together pieces of load and store operations into full
160 /// 1-D vectors. We envision that the SLP vectorizer is a good way to capture
161 /// innermost loop, control-flow dependent patterns that super-vectorization may
162 /// not be able to capture easily. In other words, super-vectorization does not
163 /// aim at replacing the SLP vectorizer and the two solutions are complementary.
164 ///
165 /// Ongoing investigations:
166 /// -----------------------
167 /// We discuss the following *early* places where super-vectorization is
/// applicable and touch on the expected benefits and risks. We list the
169 /// opportunities in the context of the traditional polyhedral compiler flow
170 /// described in PPCG. There are essentially 6 places in the MLIR pass pipeline
171 /// we expect to experiment with super-vectorization:
172 /// 1. Right after language lowering to MLIR: this is the earliest time where
173 ///    super-vectorization is expected to be applied. At this level, all the
174 ///    language/user/library-level annotations are available and can be fully
175 ///    exploited. Examples include loop-type annotations (such as parallel,
176 ///    reduction, scan, dependence distance vector, vectorizable) as well as
///    memory access annotations (such as guaranteed non-aliasing writes,
///    indirect accesses that are permutations by construction), or the fact
///    that a particular operation is prescribed atomic by the user. At this
180 ///    level, anything that enriches what dependence analysis can do should be
181 ///    aggressively exploited. At this level we are close to having explicit
182 ///    vector types in the language, except we do not impose that burden on the
183 ///    programmer/library: we derive information from scalar code + annotations.
184 /// 2. After dependence analysis and before polyhedral scheduling: the
185 ///    information that supports vectorization does not need to be supplied by a
186 ///    higher level of abstraction. Traditional dependence analysis is available
187 ///    in MLIR and will be used to drive vectorization and cost models.
188 ///
189 /// Let's pause here and remark that applying super-vectorization as described
190 /// in 1. and 2. presents clear opportunities and risks:
191 ///   - the opportunity is that vectorization is burned in the type system and
192 ///   is protected from the adverse effect of loop scheduling, tiling, loop
///   interchange and all passes downstream. Provided that subsequent passes
///   are able to operate on vector types, the vector shapes, associated loop
///   iterator properties, alignment, and contiguity of fastest varying
///   dimensions are preserved until we lower the super-vector types. We expect
///   this to significantly rein in the adverse effects of phase ordering.
///   - the risks are that a. all passes after super-vectorization have to work
///   on elemental vector types (note that this is always true, wherever
///   vectorization is applied) and b. that imposing vectorization constraints
201 ///   too early may be overall detrimental to loop fusion, tiling and other
202 ///   transformations because the dependence distances are coarsened when
203 ///   operating on elemental vector types. For this reason, the pattern
204 ///   profitability analysis should include a component that also captures the
205 ///   maximal amount of fusion available under a particular pattern. This is
206 ///   still at the stage of rough ideas but in this context, search is our
207 ///   friend as the Tensor Comprehensions and auto-TVM contributions
208 ///   demonstrated previously.
209 /// Bottom-line is we do not yet have good answers for the above but aim at
210 /// making it easy to answer such questions.
211 ///
212 /// Back to our listing, the last places where early super-vectorization makes
213 /// sense are:
214 /// 3. right after polyhedral-style scheduling: PLUTO-style algorithms are known
215 ///    to improve locality, parallelism and be configurable (e.g. max-fuse,
216 ///    smart-fuse etc). They can also have adverse effects on contiguity
217 ///    properties that are required for vectorization but the vector.transfer
218 ///    copy-reshape-pad-transpose abstraction is expected to help recapture
219 ///    these properties.
220 /// 4. right after polyhedral-style scheduling+tiling;
221 /// 5. right after scheduling+tiling+rescheduling: points 4 and 5 represent
222 ///    probably the most promising places because applying tiling achieves a
223 ///    separation of concerns that allows rescheduling to worry less about
224 ///    locality and more about parallelism and distribution (e.g. min-fuse).
225 ///
226 /// At these levels the risk-reward looks different: on one hand we probably
227 /// lost a good deal of language/user/library-level annotation; on the other
228 /// hand we gained parallelism and locality through scheduling and tiling.
229 /// However we probably want to ensure tiling is compatible with the
230 /// full-tile-only abstraction used in super-vectorization or suffer the
231 /// consequences. It is too early to place bets on what will win but we expect
232 /// super-vectorization to be the right abstraction to allow exploring at all
233 /// these levels. And again, search is our friend.
234 ///
235 /// Lastly, we mention it again here:
/// 6. as an MLIR-based alternative to VPLAN.
237 ///
238 /// Lowering, unrolling, pipelining:
239 /// ================================
240 /// TODO(ntv): point to the proper places.
241 ///
242 /// Algorithm:
243 /// ==========
244 /// The algorithm proceeds in a few steps:
///  1. defining super-vectorization patterns and matching them on the tree of
///     AffineForOp. A super-vectorization pattern is defined as a recursive
///     data structure that matches and captures nested, imperfectly-nested
///     loops that have a. conformable loop annotations attached (e.g. parallel,
///     reduction, vectorizable, ...) as well as b. all contiguous load/store
///     operations along a specified minor dimension (not necessarily the
///     fastest varying);
252 ///  2. analyzing those patterns for profitability (TODO(ntv): and
253 ///     interference);
254 ///  3. Then, for each pattern in order:
255 ///    a. applying iterative rewriting of the loop and the load operations in
256 ///       DFS postorder. Rewriting is implemented by coarsening the loops and
257 ///       turning load operations into opaque vector.transfer_read ops;
258 ///    b. keeping track of the load operations encountered as "roots" and the
259 ///       store operations as "terminals";
260 ///    c. traversing the use-def chains starting from the roots and iteratively
261 ///       propagating vectorized values. Scalar values that are encountered
262 ///       during this process must come from outside the scope of the current
263 ///       pattern (TODO(ntv): enforce this and generalize). Such a scalar value
264 ///       is vectorized only if it is a constant (into a vector splat). The
265 ///       non-constant case is not supported for now and results in the pattern
266 ///       failing to vectorize;
///    d. performing a second traversal on the terminals (store ops) to
///       rewrite the scalar value they write to memory into vector form.
269 ///       If the scalar value has been vectorized previously, we simply replace
270 ///       it by its vector form. Otherwise, if the scalar value is a constant,
271 ///       it is vectorized into a splat. In all other cases, vectorization for
272 ///       the pattern currently fails.
273 ///    e. if everything under the root AffineForOp in the current pattern
274 ///       vectorizes properly, we commit that loop to the IR. Otherwise we
275 ///       discard it and restore a previously cloned version of the loop. Thanks
276 ///       to the recursive scoping nature of matchers and captured patterns,
277 ///       this is transparently achieved by a simple RAII implementation.
///    f. vectorization is applied on the next pattern in the list. Because
///       pattern interference avoidance is not yet implemented and because we
///       do not support further vectorizing an already vectorized load, we
///       need to re-verify that the pattern is still vectorizable. This is
///       expected to make cost models more difficult to write and is subject
///       to improvement in the future.
284 ///
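/// As a compact illustration of steps 3.a to 3.d, consider the following loop
/// (a hand-written sketch in the simplified notation of this comment, not
/// verbatim pass output), vectorized by a 1-D vector of size 128:
/// ```mlir
///   affine.for %i = 0 to %N {
///     %a = affine.load %A[%i] : memref<?xf32>
///     %s = addf %a, %a : f32
///     affine.store %s, %C[%i] : memref<?xf32>
///   }
/// ```
/// The loop step is coarsened and the load becomes a vector.transfer_read
/// (3.a), vector values are propagated along the use-def chain (3.c) and the
/// terminal store is rewritten last (3.d):
/// ```mlir
///   affine.for %i = 0 to %N step 128 {
///     %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>
///     %v_s = addf %v_a, %v_a : vector<128xf32>
///     vector.transfer_write %v_s, %C[%i] : vector<128xf32>, memref<?xf32>
///   }
/// ```
///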
285 /// Points c. and d. above are worth additional comment. In most passes that
286 /// do not change the type of operands, it is usually preferred to eagerly
287 /// `replaceAllUsesWith`. Unfortunately this does not work for vectorization
288 /// because during the use-def chain traversal, all the operands of an operation
289 /// must be available in vector form. Trying to propagate eagerly makes the IR
290 /// temporarily invalid and results in errors such as:
291 ///   `vectorize.mlir:308:13: error: 'addf' op requires the same type for all
292 ///   operands and results
293 ///      %s5 = addf %a5, %b5 : f32`
294 ///
295 /// Lastly, we show a minimal example for which use-def chains rooted in load /
296 /// vector.transfer_read are not enough. This is what motivated splitting
297 /// terminal processing out of the use-def chains starting from loads. In the
/// following snippet, there is simply no load:
299 /// ```mlir
300 /// func @fill(%A : memref<128xf32>) -> () {
301 ///   %f1 = constant 1.0 : f32
302 ///   affine.for %i0 = 0 to 32 {
303 ///     affine.store %f1, %A[%i0] : memref<128xf32, 0>
304 ///   }
305 ///   return
306 /// }
307 /// ```
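///
/// A hypothetical vectorized form of this example, assuming a 1-D virtual
/// vector size of 128 (hand-written here to mirror the examples further below,
/// not verbatim pass output), splats the constant and writes it back through a
/// vector.transfer_write:
/// ```mlir
/// func @fill(%A : memref<128xf32>) -> () {
///   %f1 = constant 1.0 : f32
///   affine.for %i0 = 0 to 32 step 128 {
///     %cst = constant dense<1.0> : vector<128xf32>
///     vector.transfer_write %cst, %A[%i0] : vector<128xf32>, memref<128xf32>
///   }
///   return
/// }
/// ```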
308 ///
309 /// Choice of loop transformation to support the algorithm:
310 /// =======================================================
311 /// The choice of loop transformation to apply for coarsening vectorized loops
/// is still subject to exploratory tradeoffs. In particular, say we want to
/// vectorize by a factor of 128; we want to transform the following input:
314 /// ```mlir
315 ///   affine.for %i = %M to %N {
316 ///     %a = affine.load %A[%i] : memref<?xf32>
317 ///   }
318 /// ```
319 ///
/// Traditionally, one would vectorize late (after scheduling, tiling, memory
/// promotion, etc.), say after stripmining (and potentially unrolling in the
/// case of LLVM's SLP vectorizer):
323 /// ```mlir
324 ///   affine.for %i = floor(%M, 128) to ceil(%N, 128) {
325 ///     affine.for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) {
326 ///       %a = affine.load %A[%ii] : memref<?xf32>
327 ///     }
328 ///   }
329 /// ```
330 ///
331 /// Instead, we seek to vectorize early and freeze vector types before
332 /// scheduling, so we want to generate a pattern that resembles:
333 /// ```mlir
334 ///   affine.for %i = ? to ? step ? {
335 ///     %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>
336 ///   }
337 /// ```
338 ///
/// Several ways of representing this coarsening were considered:
/// i. simply dividing the lower / upper bounds by 128 creates issues
///    when representing expressions such as ii + 1 because now we only
///    have access to original values that have been divided. Additional
///    information is needed to specify accesses at below-128 granularity;
/// ii. another alternative is to coarsen the loop step but this may have
///    consequences on dependence analysis and fusability of loops: fusable
///    loops probably need to have the same step (because we don't want to
///    stripmine/unroll to enable fusion).
347 /// As a consequence, we choose to represent the coarsening using the loop
348 /// step for now and reevaluate in the future. Note that we can renormalize
349 /// loop steps later if/when we have evidence that they are problematic.
350 ///
351 /// For the simple strawman example above, vectorizing for a 1-D vector
352 /// abstraction of size 128 returns code similar to:
353 /// ```mlir
354 ///   affine.for %i = %M to %N step 128 {
355 ///     %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>
356 ///   }
357 /// ```
358 ///
359 /// Unsupported cases, extensions, and work in progress (help welcome :-) ):
360 /// ========================================================================
361 ///   1. lowering to concrete vector types for various HW;
362 ///   2. reduction support;
363 ///   3. non-effecting padding during vector.transfer_read and filter during
364 ///      vector.transfer_write;
///   4. misalignment support in vector.transfer_read / vector.transfer_write
366 ///      (hopefully without read-modify-writes);
367 ///   5. control-flow support;
368 ///   6. cost-models, heuristics and search;
369 ///   7. Op implementation, extensions and implication on memref views;
370 ///   8. many TODOs left around.
371 ///
372 /// Examples:
373 /// =========
374 /// Consider the following Function:
375 /// ```mlir
376 /// func @vector_add_2d(%M : index, %N : index) -> f32 {
377 ///   %A = alloc (%M, %N) : memref<?x?xf32, 0>
378 ///   %B = alloc (%M, %N) : memref<?x?xf32, 0>
379 ///   %C = alloc (%M, %N) : memref<?x?xf32, 0>
380 ///   %f1 = constant 1.0 : f32
381 ///   %f2 = constant 2.0 : f32
382 ///   affine.for %i0 = 0 to %M {
383 ///     affine.for %i1 = 0 to %N {
384 ///       // non-scoped %f1
385 ///       affine.store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
386 ///     }
387 ///   }
388 ///   affine.for %i2 = 0 to %M {
389 ///     affine.for %i3 = 0 to %N {
390 ///       // non-scoped %f2
391 ///       affine.store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
392 ///     }
393 ///   }
394 ///   affine.for %i4 = 0 to %M {
395 ///     affine.for %i5 = 0 to %N {
396 ///       %a5 = affine.load %A[%i4, %i5] : memref<?x?xf32, 0>
397 ///       %b5 = affine.load %B[%i4, %i5] : memref<?x?xf32, 0>
398 ///       %s5 = addf %a5, %b5 : f32
399 ///       // non-scoped %f1
400 ///       %s6 = addf %s5, %f1 : f32
401 ///       // non-scoped %f2
402 ///       %s7 = addf %s5, %f2 : f32
403 ///       // diamond dependency.
404 ///       %s8 = addf %s7, %s6 : f32
405 ///       affine.store %s8, %C[%i4, %i5] : memref<?x?xf32, 0>
406 ///     }
407 ///   }
408 ///   %c7 = constant 7 : index
409 ///   %c42 = constant 42 : index
410 ///   %res = load %C[%c7, %c42] : memref<?x?xf32, 0>
411 ///   return %res : f32
412 /// }
413 /// ```
414 ///
/// The -affine-super-vectorize pass with the following arguments:
/// ```
/// -affine-super-vectorize="virtual-vector-size=256 test-fastest-varying=0"
418 /// ```
419 ///
420 /// produces this standard innermost-loop vectorized code:
421 /// ```mlir
422 /// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 {
423 ///   %0 = alloc(%arg0, %arg1) : memref<?x?xf32>
424 ///   %1 = alloc(%arg0, %arg1) : memref<?x?xf32>
425 ///   %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
426 ///   %cst = constant 1.0 : f32
427 ///   %cst_0 = constant 2.0 : f32
428 ///   affine.for %i0 = 0 to %arg0 {
429 ///     affine.for %i1 = 0 to %arg1 step 256 {
///       %cst_1 = constant dense<1.0> : vector<256xf32>
432 ///       vector.transfer_write %cst_1, %0[%i0, %i1] :
433 ///                vector<256xf32>, memref<?x?xf32>
434 ///     }
435 ///   }
436 ///   affine.for %i2 = 0 to %arg0 {
437 ///     affine.for %i3 = 0 to %arg1 step 256 {
///       %cst_2 = constant dense<2.0> : vector<256xf32>
440 ///       vector.transfer_write %cst_2, %1[%i2, %i3] :
441 ///                vector<256xf32>, memref<?x?xf32>
442 ///     }
443 ///   }
444 ///   affine.for %i4 = 0 to %arg0 {
445 ///     affine.for %i5 = 0 to %arg1 step 256 {
446 ///       %3 = vector.transfer_read %0[%i4, %i5] :
447 ///            memref<?x?xf32>, vector<256xf32>
448 ///       %4 = vector.transfer_read %1[%i4, %i5] :
449 ///            memref<?x?xf32>, vector<256xf32>
450 ///       %5 = addf %3, %4 : vector<256xf32>
///       %cst_3 = constant dense<1.0> : vector<256xf32>
453 ///       %6 = addf %5, %cst_3 : vector<256xf32>
///       %cst_4 = constant dense<2.0> : vector<256xf32>
456 ///       %7 = addf %5, %cst_4 : vector<256xf32>
457 ///       %8 = addf %7, %6 : vector<256xf32>
458 ///       vector.transfer_write %8, %2[%i4, %i5] :
459 ///                vector<256xf32>, memref<?x?xf32>
460 ///     }
461 ///   }
462 ///   %c7 = constant 7 : index
463 ///   %c42 = constant 42 : index
464 ///   %9 = load %2[%c7, %c42] : memref<?x?xf32>
465 ///   return %9 : f32
466 /// }
467 /// ```
468 ///
/// The -affine-super-vectorize pass with the following arguments:
/// ```
/// -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0"
472 /// ```
473 ///
474 /// produces this more interesting mixed outer-innermost-loop vectorized code:
475 /// ```mlir
476 /// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 {
477 ///   %0 = alloc(%arg0, %arg1) : memref<?x?xf32>
478 ///   %1 = alloc(%arg0, %arg1) : memref<?x?xf32>
479 ///   %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
480 ///   %cst = constant 1.0 : f32
481 ///   %cst_0 = constant 2.0 : f32
482 ///   affine.for %i0 = 0 to %arg0 step 32 {
483 ///     affine.for %i1 = 0 to %arg1 step 256 {
///       %cst_1 = constant dense<1.0> : vector<32x256xf32>
486 ///       vector.transfer_write %cst_1, %0[%i0, %i1] :
487 ///                vector<32x256xf32>, memref<?x?xf32>
488 ///     }
489 ///   }
490 ///   affine.for %i2 = 0 to %arg0 step 32 {
491 ///     affine.for %i3 = 0 to %arg1 step 256 {
///       %cst_2 = constant dense<2.0> : vector<32x256xf32>
494 ///       vector.transfer_write %cst_2, %1[%i2, %i3] :
495 ///                vector<32x256xf32>, memref<?x?xf32>
496 ///     }
497 ///   }
498 ///   affine.for %i4 = 0 to %arg0 step 32 {
499 ///     affine.for %i5 = 0 to %arg1 step 256 {
500 ///       %3 = vector.transfer_read %0[%i4, %i5] :
///                memref<?x?xf32>, vector<32x256xf32>
502 ///       %4 = vector.transfer_read %1[%i4, %i5] :
503 ///                memref<?x?xf32>, vector<32x256xf32>
504 ///       %5 = addf %3, %4 : vector<32x256xf32>
///       %cst_3 = constant dense<1.0> : vector<32x256xf32>
507 ///       %6 = addf %5, %cst_3 : vector<32x256xf32>
///       %cst_4 = constant dense<2.0> : vector<32x256xf32>
510 ///       %7 = addf %5, %cst_4 : vector<32x256xf32>
511 ///       %8 = addf %7, %6 : vector<32x256xf32>
512 ///       vector.transfer_write %8, %2[%i4, %i5] :
513 ///                vector<32x256xf32>, memref<?x?xf32>
514 ///     }
515 ///   }
516 ///   %c7 = constant 7 : index
517 ///   %c42 = constant 42 : index
518 ///   %9 = load %2[%c7, %c42] : memref<?x?xf32>
519 ///   return %9 : f32
520 /// }
521 /// ```
522 ///
523 /// Of course, much more intricate n-D imperfectly-nested patterns can be
524 /// vectorized too and specified in a fully declarative fashion.
525 
526 #define DEBUG_TYPE "early-vect"
527 
528 using functional::makePtrDynCaster;
529 using functional::map;
530 using llvm::dbgs;
531 using llvm::SetVector;
532 
533 /// Forward declaration.
534 static FilterFunctionType
535 isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,
536                              int fastestVaryingMemRefDimension);
537 
538 /// Creates a vectorization pattern from the command line arguments.
539 /// Up to 3-D patterns are supported.
540 /// If the command line argument requests a pattern of higher order, returns an
541 /// empty pattern list which will conservatively result in no vectorization.
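/// For illustration (the concrete shapes below are an assumption mirroring the
/// @vector_add_2d example in the file header, not a normative contract): with
/// vectorRank = 2 and fastestVaryingPattern = {1, 0}, the returned pattern
/// matches 2-deep nests of parallel affine.for loops whose load/store accesses
/// vary along the second-fastest varying memref dimension for the outer loop
/// and along the fastest varying dimension for the inner loop, e.g.:
///
///   affine.for %i = 0 to %M {
///     affine.for %j = 0 to %N {
///       %a = affine.load %A[%i, %j] : memref<?x?xf32>
///     }
///   }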
542 static std::vector<NestedPattern>
543 makePatterns(const DenseSet<Operation *> &parallelLoops, int vectorRank,
544              ArrayRef<int64_t> fastestVaryingPattern) {
545   using matcher::For;
546   int64_t d0 = fastestVaryingPattern.empty() ? -1 : fastestVaryingPattern[0];
547   int64_t d1 = fastestVaryingPattern.size() < 2 ? -1 : fastestVaryingPattern[1];
548   int64_t d2 = fastestVaryingPattern.size() < 3 ? -1 : fastestVaryingPattern[2];
549   switch (vectorRank) {
550   case 1:
551     return {For(isVectorizableLoopPtrFactory(parallelLoops, d0))};
552   case 2:
553     return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
554                 For(isVectorizableLoopPtrFactory(parallelLoops, d1)))};
555   case 3:
556     return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
557                 For(isVectorizableLoopPtrFactory(parallelLoops, d1),
558                     For(isVectorizableLoopPtrFactory(parallelLoops, d2))))};
559   default: {
560     return std::vector<NestedPattern>();
561   }
562   }
563 }
564 
565 static NestedPattern &vectorTransferPattern() {
566   static auto pattern = matcher::Op([](Operation &op) {
567     return isa<vector::TransferReadOp>(op) || isa<vector::TransferWriteOp>(op);
568   });
569   return pattern;
570 }
571 
572 namespace {
573 
574 /// Base state for the vectorize pass.
575 /// Command line arguments are preempted by non-empty pass arguments.
576 struct Vectorize : public FunctionPass<Vectorize> {
577   Vectorize() = default;
578   Vectorize(const Vectorize &) {}
579   Vectorize(ArrayRef<int64_t> virtualVectorSize);
580   void runOnFunction() override;
581 
582   /// The virtual vector size that we vectorize to.
583   ListOption<int64_t> vectorSizes{
584       *this, "virtual-vector-size",
585       llvm::cl::desc("Specify an n-D virtual vector size for vectorization"),
586       llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated};
587   /// Optionally, the fixed mapping from loop to fastest varying MemRef
588   /// dimension for all the MemRefs within a loop pattern:
589   ///   the index represents the loop depth, the value represents the k^th
590   ///   fastest varying memory dimension.
591   /// This is voluntarily restrictive and is meant to precisely target a
592   /// particular loop/op pair, for testing purposes.
593   ListOption<int64_t> fastestVaryingPattern{
594       *this, "test-fastest-varying",
595       llvm::cl::desc(
596           "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory"
597           " dimensions to match. See defaultPatterns in Vectorize.cpp for a"
598           " description and examples. This is used for testing purposes"),
599       llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated};
600 };
601 
602 } // end anonymous namespace
603 
604 Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) {
605   vectorSizes->assign(virtualVectorSize.begin(), virtualVectorSize.end());
606 }
607 
/////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate. /////
610 namespace {
611 
612 struct VectorizationStrategy {
613   SmallVector<int64_t, 8> vectorSizes;
614   DenseMap<Operation *, unsigned> loopToVectorDim;
615 };
616 
617 } // end anonymous namespace
618 
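/// Records, in `strategy`, the vector dimension along which `loop` will be
/// vectorized, counting from the innermost loop of the matched pattern, or
/// leaves it unassigned (i.e. not vectorized) when the loop sits further out
/// in the pattern than the requested vector rank. As a worked example
/// (illustrative numbers, not taken from a particular test): with
/// vectorSizes = {32, 256} and patternDepth = 2, the outer matched loop
/// (depthInPattern = 0) maps to vector dimension 0 and the inner one
/// (depthInPattern = 1) maps to vector dimension 1; with patternDepth = 3 and
/// the same 2-D vector sizes, the outermost loop is simply not vectorized.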
619 static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern,
620                                       unsigned patternDepth,
621                                       VectorizationStrategy *strategy) {
622   assert(patternDepth > depthInPattern &&
623          "patternDepth is greater than depthInPattern");
624   if (patternDepth - depthInPattern > strategy->vectorSizes.size()) {
625     // Don't vectorize this loop
626     return;
627   }
628   strategy->loopToVectorDim[loop] =
629       strategy->vectorSizes.size() - (patternDepth - depthInPattern);
630 }
631 
632 /// Implements a simple strawman strategy for vectorization.
633 /// Given a matched pattern `matches` of depth `patternDepth`, this strategy
634 /// greedily assigns the fastest varying dimension ** of the vector ** to the
635 /// innermost loop in the pattern.
636 /// When coupled with a pattern that looks for the fastest varying dimension in
637 /// load/store MemRefs, this creates a generic vectorization strategy that works
638 /// for any loop in a hierarchy (outermost, innermost or intermediate).
639 ///
640 /// TODO(ntv): In the future we should additionally increase the power of the
641 /// profitability analysis along 3 directions:
642 ///   1. account for loop extents (both static and parametric + annotations);
643 ///   2. account for data layout permutations;
644 ///   3. account for impact of vectorization on maximal loop fusion.
645 /// Then we can quantify the above to build a cost model and search over
646 /// strategies.
647 static LogicalResult analyzeProfitability(ArrayRef<NestedMatch> matches,
648                                           unsigned depthInPattern,
649                                           unsigned patternDepth,
650                                           VectorizationStrategy *strategy) {
651   for (auto m : matches) {
652     if (failed(analyzeProfitability(m.getMatchedChildren(), depthInPattern + 1,
653                                     patternDepth, strategy))) {
654       return failure();
655     }
656     vectorizeLoopIfProfitable(m.getMatchedOperation(), depthInPattern,
657                               patternDepth, strategy);
658   }
659   return success();
660 }
661 
662 ///// end TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate /////
663 
664 namespace {
665 
666 struct VectorizationState {
667   /// Adds an entry of pre/post vectorization operations in the state.
668   void registerReplacement(Operation *key, Operation *value);
669   /// When the current vectorization pattern is successful, this erases the
670   /// operations that were marked for erasure in the proper order and resets
671   /// the internal state for the next pattern.
672   void finishVectorizationPattern();
673 
  // In-order tracking of original Operations that have been vectorized.
675   // Erase in reverse order.
676   SmallVector<Operation *, 16> toErase;
  // Set of Operations that have been vectorized (the values in the
678   // vectorizationMap for hashed access). The vectorizedSet is used in
679   // particular to filter the operations that have already been vectorized by
680   // this pattern, when iterating over nested loops in this pattern.
681   DenseSet<Operation *> vectorizedSet;
682   // Map of old scalar Operation to new vectorized Operation.
683   DenseMap<Operation *, Operation *> vectorizationMap;
684   // Map of old scalar Value to new vectorized Value.
685   DenseMap<Value, Value> replacementMap;
686   // The strategy drives which loop to vectorize by which amount.
687   const VectorizationStrategy *strategy;
688   // Use-def roots. These represent the starting points for the worklist in the
689   // vectorizeNonTerminals function. They consist of the subset of load
690   // operations that have been vectorized. They can be retrieved from
691   // `vectorizationMap` but it is convenient to keep track of them in a separate
692   // data structure.
693   DenseSet<Operation *> roots;
694   // Terminal operations for the worklist in the vectorizeNonTerminals
695   // function. They consist of the subset of store operations that have been
696   // vectorized. They can be retrieved from `vectorizationMap` but it is
697   // convenient to keep track of them in a separate data structure. Since they
698   // do not necessarily belong to use-def chains starting from loads (e.g
699   // storing a constant), we need to handle them in a post-pass.
700   DenseSet<Operation *> terminals;
701   // Checks that the type of `op` is AffineStoreOp and adds it to the terminals
702   // set.
703   void registerTerminal(Operation *op);
704   // Folder used to factor out constant creation.
705   OperationFolder *folder;
706 
707 private:
708   void registerReplacement(Value key, Value value);
709 };
710 
} // end anonymous namespace
712 
void VectorizationState::registerReplacement(Operation *key,
                                             Operation *value) {
714   LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op: ");
715   LLVM_DEBUG(key->print(dbgs()));
716   LLVM_DEBUG(dbgs() << "  into  ");
717   LLVM_DEBUG(value->print(dbgs()));
  assert(key->getNumResults() == 1 && "expected single-result key op");
  assert(value->getNumResults() == 1 && "expected single-result value op");
  assert(vectorizedSet.count(value) == 0 && "already registered");
  assert(vectorizationMap.count(key) == 0 && "already registered");
722   toErase.push_back(key);
723   vectorizedSet.insert(value);
724   vectorizationMap.insert(std::make_pair(key, value));
725   registerReplacement(key->getResult(0), value->getResult(0));
726   if (isa<AffineLoadOp>(key)) {
727     assert(roots.count(key) == 0 && "root was already inserted previously");
728     roots.insert(key);
729   }
730 }
731 
732 void VectorizationState::registerTerminal(Operation *op) {
  assert(isa<AffineStoreOp>(op) && "terminal must be an AffineStoreOp");
734   assert(terminals.count(op) == 0 &&
735          "terminal was already inserted previously");
736   terminals.insert(op);
737 }
738 
739 void VectorizationState::finishVectorizationPattern() {
740   while (!toErase.empty()) {
741     auto *op = toErase.pop_back_val();
742     LLVM_DEBUG(dbgs() << "\n[early-vect] finishVectorizationPattern erase: ");
743     LLVM_DEBUG(op->print(dbgs()));
744     op->erase();
745   }
746 }
747 
748 void VectorizationState::registerReplacement(Value key, Value value) {
749   assert(replacementMap.count(key) == 0 && "replacement already registered");
750   replacementMap.insert(std::make_pair(key, value));
751 }
752 
// Applies `map` to `mapOperands`, materializing one single-result
// AffineApplyOp per map result and appending the resulting values to
// `results`.
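// For instance (an illustrative example, not tied to a particular test), given
// the access map (d0, d1) -> (d0 floordiv 2, d1 + 1) applied to (%i, %j), this
// materializes two affine.apply ops computing %i floordiv 2 and %j + 1; their
// results are then used as the indices of the vector.transfer op built for the
// memory op.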
754 static void computeMemoryOpIndices(Operation *op, AffineMap map,
755                                    ValueRange mapOperands,
756                                    SmallVectorImpl<Value> &results) {
757   OpBuilder builder(op);
758   for (auto resultExpr : map.getResults()) {
759     auto singleResMap =
760         AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
761     auto afOp =
762         builder.create<AffineApplyOp>(op->getLoc(), singleResMap, mapOperands);
763     results.push_back(afOp);
764   }
765 }
766 
767 ////// TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. ////
768 
769 /// Handles the vectorization of load and store MLIR operations.
770 ///
771 /// AffineLoadOp operations are the roots of the vectorizeNonTerminals call.
772 /// They are vectorized immediately. The resulting vector.transfer_read is
773 /// immediately registered to replace all uses of the AffineLoadOp in this
774 /// pattern's scope.
775 ///
776 /// AffineStoreOp are the terminals of the vectorizeNonTerminals call. They
777 /// need to be vectorized late once all the use-def chains have been traversed.
778 /// Additionally, they may have ssa-values operands which come from outside the
779 /// scope of the current pattern.
780 /// Such special cases force us to delay the vectorization of the stores until
781 /// the last step. Here we merely register the store operation.
782 template <typename LoadOrStoreOpPointer>
783 static LogicalResult vectorizeRootOrTerminal(Value iv,
784                                              LoadOrStoreOpPointer memoryOp,
785                                              VectorizationState *state) {
786   auto memRefType = memoryOp.getMemRef().getType().template cast<MemRefType>();
787 
788   auto elementType = memRefType.getElementType();
789   // TODO(ntv): ponder whether we want to further vectorize a vector value.
790   assert(VectorType::isValidElementType(elementType) &&
791          "Not a valid vector element type");
792   auto vectorType = VectorType::get(state->strategy->vectorSizes, elementType);
793 
794   // Materialize a MemRef with 1 vector.
795   auto *opInst = memoryOp.getOperation();
796   // For now, vector.transfers must be aligned, operate only on indices with an
797   // identity subset of AffineMap and do not change layout.
798   // TODO(ntv): increase the expressiveness power of vector.transfer operations
799   // as needed by various targets.
800   if (auto load = dyn_cast<AffineLoadOp>(opInst)) {
801     OpBuilder b(opInst);
802     ValueRange mapOperands = load.getMapOperands();
803     SmallVector<Value, 8> indices;
804     indices.reserve(load.getMemRefType().getRank());
805     if (load.getAffineMap() !=
806         b.getMultiDimIdentityMap(load.getMemRefType().getRank())) {
807       computeMemoryOpIndices(opInst, load.getAffineMap(), mapOperands, indices);
808     } else {
809       indices.append(mapOperands.begin(), mapOperands.end());
810     }
811     auto permutationMap =
812         makePermutationMap(opInst, indices, state->strategy->loopToVectorDim);
813     if (!permutationMap)
      return failure();
815     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
816     LLVM_DEBUG(permutationMap.print(dbgs()));
817     auto transfer = b.create<vector::TransferReadOp>(
818         opInst->getLoc(), vectorType, memoryOp.getMemRef(), indices,
819         AffineMapAttr::get(permutationMap),
820         // TODO(b/144455320) add a proper padding value, not just 0.0 : f32
821         state->folder->create<ConstantFloatOp>(b, opInst->getLoc(),
822                                                APFloat(0.0f), b.getF32Type()));
823     state->registerReplacement(opInst, transfer.getOperation());
824   } else {
825     state->registerTerminal(opInst);
826   }
827   return success();
828 }
829 /// end TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. ///
830 
/// Coarsens the loop step and transforms all remaining load and store
/// operations into the appropriate vector.transfer.
833 static LogicalResult vectorizeAffineForOp(AffineForOp loop, int64_t step,
834                                           VectorizationState *state) {
835   using namespace functional;
836   loop.setStep(step);
837 
838   FilterFunctionType notVectorizedThisPattern = [state](Operation &op) {
839     if (!matcher::isLoadOrStore(op)) {
840       return false;
841     }
842     return state->vectorizationMap.count(&op) == 0 &&
843            state->vectorizedSet.count(&op) == 0 &&
844            state->roots.count(&op) == 0 && state->terminals.count(&op) == 0;
845   };
846   auto loadAndStores = matcher::Op(notVectorizedThisPattern);
847   SmallVector<NestedMatch, 8> loadAndStoresMatches;
848   loadAndStores.match(loop.getOperation(), &loadAndStoresMatches);
849   for (auto ls : loadAndStoresMatches) {
850     auto *opInst = ls.getMatchedOperation();
851     auto load = dyn_cast<AffineLoadOp>(opInst);
852     auto store = dyn_cast<AffineStoreOp>(opInst);
853     LLVM_DEBUG(opInst->print(dbgs()));
854     LogicalResult result =
855         load ? vectorizeRootOrTerminal(loop.getInductionVar(), load, state)
856              : vectorizeRootOrTerminal(loop.getInductionVar(), store, state);
857     if (failed(result)) {
858       return failure();
859     }
860   }
861   return success();
862 }
863 
/// Returns a FilterFunctionType that can be used in NestedPattern to match a
/// loop whose underlying load/store accesses are either invariant or all
/// varying along the `fastestVaryingMemRefDimension`.
867 static FilterFunctionType
868 isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,
869                              int fastestVaryingMemRefDimension) {
870   return [&parallelLoops, fastestVaryingMemRefDimension](Operation &forOp) {
871     auto loop = cast<AffineForOp>(forOp);
872     auto parallelIt = parallelLoops.find(loop);
873     if (parallelIt == parallelLoops.end())
874       return false;
875     int memRefDim = -1;
876     auto vectorizableBody =
877         isVectorizableLoopBody(loop, &memRefDim, vectorTransferPattern());
878     if (!vectorizableBody)
879       return false;
880     return memRefDim == -1 || fastestVaryingMemRefDimension == -1 ||
881            memRefDim == fastestVaryingMemRefDimension;
882   };
883 }
884 
885 /// Apply vectorization of `loop` according to `state`. This is only triggered
886 /// if all vectorizations in `childrenMatches` have already succeeded
887 /// recursively in DFS post-order.
888 static LogicalResult
889 vectorizeLoopsAndLoadsRecursively(NestedMatch oneMatch,
890                                   VectorizationState *state) {
891   auto *loopInst = oneMatch.getMatchedOperation();
892   auto loop = cast<AffineForOp>(loopInst);
893   auto childrenMatches = oneMatch.getMatchedChildren();
894 
895   // 1. DFS postorder recursion, if any of my children fails, I fail too.
896   for (auto m : childrenMatches) {
897     if (failed(vectorizeLoopsAndLoadsRecursively(m, state))) {
898       return failure();
899     }
900   }
901 
902   // 2. This loop may have been omitted from vectorization for various reasons
903   // (e.g. due to the performance model or pattern depth > vector size).
904   auto it = state->strategy->loopToVectorDim.find(loopInst);
905   if (it == state->strategy->loopToVectorDim.end()) {
906     return success();
907   }
908 
909   // 3. Actual post-order transformation.
910   auto vectorDim = it->second;
911   assert(vectorDim < state->strategy->vectorSizes.size() &&
912          "vector dim overflow");
913   //   a. get actual vector size
914   auto vectorSize = state->strategy->vectorSizes[vectorDim];
915   //   b. loop transformation for early vectorization is still subject to
916   //     exploratory tradeoffs (see top of the file). Apply coarsening, i.e.:
917   //        | ub -> ub
918   //        | step -> step * vectorSize
919   LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
920                     << " : ");
921   LLVM_DEBUG(loopInst->print(dbgs()));
922   return vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state);
923 }
924 
/// Tries to transform a scalar constant into a vector splat of that constant.
/// Returns the vectorized splat value if `type` is a valid vector type and the
/// scalar constant is a valid vector element type; otherwise returns nullptr.
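/// For example (illustrative IR in the style of the examples at the top of
/// this file), with `type` = vector<256xf32>, the scalar constant
/// ```mlir
///   %f1 = constant 1.0 : f32
/// ```
/// gives rise to the splat
/// ```mlir
///   %cst = constant dense<1.0> : vector<256xf32>
/// ```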
930 static Value vectorizeConstant(Operation *op, ConstantOp constant, Type type) {
931   if (!type || !type.isa<VectorType>() ||
932       !VectorType::isValidElementType(constant.getType())) {
933     return nullptr;
934   }
935   OpBuilder b(op);
936   Location loc = op->getLoc();
937   auto vectorType = type.cast<VectorType>();
938   auto attr = DenseElementsAttr::get(vectorType, constant.getValue());
939   auto *constantOpInst = constant.getOperation();
940 
941   OperationState state(loc, constantOpInst->getName().getStringRef(), {},
942                        {vectorType}, {b.getNamedAttr("value", attr)});
943 
944   return b.createOperation(state)->getResult(0);
945 }
946 
/// Tries to vectorize a given `operand` of Operation `op` during def-chain
/// propagation or during terminal vectorization, by applying the following
/// logic:
/// 1. if the defining operation is part of the vectorizedSet (i.e. vectorized
///    by use-def propagation), `operand` is already in the proper vector form;
/// 2. otherwise, the `operand` may be in some other vector form that fails to
///    vectorize at the moment (i.e. broadcasting required), returns nullptr to
///    indicate failure;
/// 3. if the `operand` is a constant, returns the vectorized form of the
///    constant;
/// 4. non-constant scalars are currently non-vectorizable, in particular to
///    guard against vectorizing an index which may be loop-variant and needs
///    special handling.
959 ///
960 /// In particular this logic captures some of the use cases where definitions
961 /// that are not scoped under the current pattern are needed to vectorize.
962 /// One such example is top level function constants that need to be splatted.
963 ///
964 /// Returns an operand that has been vectorized to match `state`'s strategy if
965 /// vectorization is possible with the above logic. Returns nullptr otherwise.
966 ///
967 /// TODO(ntv): handle more complex cases.
968 static Value vectorizeOperand(Value operand, Operation *op,
969                               VectorizationState *state) {
970   LLVM_DEBUG(dbgs() << "\n[early-vect]vectorize operand: " << operand);
971   // 1. If this value has already been vectorized this round, we are done.
972   if (state->vectorizedSet.count(operand.getDefiningOp()) > 0) {
973     LLVM_DEBUG(dbgs() << " -> already vector operand");
974     return operand;
975   }
976   // 1.b. Delayed on-demand replacement of a use.
977   //    Note that we cannot just call replaceAllUsesWith because it may result
978   //    in ops with mixed types, for ops whose operands have not all yet
979   //    been vectorized. This would be invalid IR.
980   auto it = state->replacementMap.find(operand);
981   if (it != state->replacementMap.end()) {
982     auto res = it->second;
983     LLVM_DEBUG(dbgs() << "-> delayed replacement by: " << res);
984     return res;
985   }
986   // 2. TODO(ntv): broadcast needed.
987   if (operand.getType().isa<VectorType>()) {
988     LLVM_DEBUG(dbgs() << "-> non-vectorizable");
989     return nullptr;
990   }
991   // 3. vectorize constant.
992   if (auto constant = dyn_cast_or_null<ConstantOp>(operand.getDefiningOp())) {
993     return vectorizeConstant(
994         op, constant,
995         VectorType::get(state->strategy->vectorSizes, operand.getType()));
996   }
997   // 4. currently non-vectorizable.
998   LLVM_DEBUG(dbgs() << "-> non-vectorizable: " << operand);
999   return nullptr;
1000 }
1001 
1002 /// Encodes Operation-specific behavior for vectorization. In general we assume
1003 /// that all operands of an op must be vectorized but this is not always true.
1004 /// In the future, it would be nice to have a trait that describes how a
1005 /// particular operation vectorizes. For now we implement the case distinction
1006 /// here.
1007 /// Returns a vectorized form of an operation or nullptr if vectorization fails.
1008 // TODO(ntv): consider adding a trait to Op to describe how it gets vectorized.
1009 // Maybe some Ops are not vectorizable or require some tricky logic, we cannot
1010 // do one-off logic here; ideally it would be TableGen'd.
1011 static Operation *vectorizeOneOperation(Operation *opInst,
1012                                         VectorizationState *state) {
1013   // Sanity checks.
1014   assert(!isa<AffineLoadOp>(opInst) &&
1015          "all loads must have already been fully vectorized independently");
1016   assert(!isa<vector::TransferReadOp>(opInst) &&
1017          "vector.transfer_read cannot be further vectorized");
1018   assert(!isa<vector::TransferWriteOp>(opInst) &&
1019          "vector.transfer_write cannot be further vectorized");
1020 
1021   if (auto store = dyn_cast<AffineStoreOp>(opInst)) {
1022     OpBuilder b(opInst);
1023     auto memRef = store.getMemRef();
1024     auto value = store.getValueToStore();
1025     auto vectorValue = vectorizeOperand(value, opInst, state);
1026     if (!vectorValue)
1027       return nullptr;
1028 
1029     ValueRange mapOperands = store.getMapOperands();
1030     SmallVector<Value, 8> indices;
1031     indices.reserve(store.getMemRefType().getRank());
1032     if (store.getAffineMap() !=
1033         b.getMultiDimIdentityMap(store.getMemRefType().getRank())) {
1034       computeMemoryOpIndices(opInst, store.getAffineMap(), mapOperands,
1035                              indices);
1036     } else {
1037       indices.append(mapOperands.begin(), mapOperands.end());
1038     }
1039 
1040     auto permutationMap =
1041         makePermutationMap(opInst, indices, state->strategy->loopToVectorDim);
1042     if (!permutationMap)
1043       return nullptr;
1044     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
1045     LLVM_DEBUG(permutationMap.print(dbgs()));
1046     auto transfer = b.create<vector::TransferWriteOp>(
1047         opInst->getLoc(), vectorValue, memRef, indices,
1048         AffineMapAttr::get(permutationMap));
1049     auto *res = transfer.getOperation();
1050     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << *res);
1051     // "Terminals" (i.e. AffineStoreOps) are erased on the spot.
1052     opInst->erase();
1053     return res;
1054   }
1055   if (opInst->getNumRegions() != 0)
1056     return nullptr;
1057 
1058   SmallVector<Type, 8> vectorTypes;
1059   for (auto v : opInst->getResults()) {
1060     vectorTypes.push_back(
1061         VectorType::get(state->strategy->vectorSizes, v.getType()));
1062   }
1063   SmallVector<Value, 8> vectorOperands;
1064   for (auto v : opInst->getOperands()) {
1065     vectorOperands.push_back(vectorizeOperand(v, opInst, state));
1066   }
  // If any operand failed to vectorize (i.e. is null), vectorization fails.
1068   bool success = llvm::all_of(vectorOperands, [](Value op) { return op; });
1069   if (!success) {
1070     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ an operand failed vectorize");
1071     return nullptr;
1072   }
1073 
1074   // Create a clone of the op with the proper operands and return types.
1075   // TODO(ntv): The following assumes there is always an op with a fixed
1076   // name that works both in scalar mode and vector mode.
1077   // TODO(ntv): Is it worth considering an Operation.clone operation which
1078   // changes the type so we can promote an Operation with less boilerplate?
1079   OpBuilder b(opInst);
1080   OperationState newOp(opInst->getLoc(), opInst->getName().getStringRef(),
1081                        vectorOperands, vectorTypes, opInst->getAttrs(),
1082                        /*successors=*/{},
1083                        /*regions=*/{}, opInst->hasResizableOperandsList());
1084   return b.createOperation(newOp);
1085 }
1086 
/// Iterates over the forward slice from the loads in the vectorization pattern
/// and rewrites them using their vectorized counterpart by:
///   1. Creating the forward slice starting from the loads in the
///   vectorization pattern.
///   2. Topologically sorting the forward slice.
///   3. For each operation in the slice, creating the vector form of the
///   operation and replacing each of its operands by the replacement retrieved
///   from replacementMap. If any such replacement is missing, vectorization
///   fails.
1095 static LogicalResult vectorizeNonTerminals(VectorizationState *state) {
1096   // 1. create initial worklist with the uses of the roots.
1097   SetVector<Operation *> worklist;
1098   // Note: state->roots have already been vectorized and must not be vectorized
1099   // again. This fits `getForwardSlice` which does not insert `op` in the
1100   // result.
1101   // Note: we have to exclude terminals because some of their defs may not be
1102   // nested under the vectorization pattern (e.g. constants defined in an
1103   // encompassing scope).
1104   // TODO(ntv): Use a backward slice for terminals, avoid special casing and
1105   // merge implementations.
1106   for (auto *op : state->roots) {
1107     getForwardSlice(op, &worklist, [state](Operation *op) {
1108       return state->terminals.count(op) == 0; // propagate if not terminal
1109     });
1110   }
1111   // We merged multiple slices, topological order may not hold anymore.
1112   worklist = topologicalSort(worklist);
1113 
1114   for (unsigned i = 0; i < worklist.size(); ++i) {
1115     auto *op = worklist[i];
1116     LLVM_DEBUG(dbgs() << "\n[early-vect] vectorize use: ");
1117     LLVM_DEBUG(op->print(dbgs()));
1118 
1119     // Create vector form of the operation.
1120     // Insert it just before op, on success register op as replaced.
1121     auto *vectorizedInst = vectorizeOneOperation(op, state);
1122     if (!vectorizedInst) {
1123       return failure();
1124     }
1125 
1126     // 3. Register replacement for future uses in the scope.
1127     //    Note that we cannot just call replaceAllUsesWith because it may
1128     //    result in ops with mixed types, for ops whose operands have not all
1129     //    yet been vectorized. This would be invalid IR.
1130     state->registerReplacement(op, vectorizedInst);
1131   }
1132   return success();
1133 }
1134 
1135 /// Vectorization is a recursive procedure where anything below can fail.
1136 /// The root match thus needs to maintain a clone for handling failure.
/// Each root may succeed independently but will otherwise clean up after
/// itself if anything below it fails.
1139 static LogicalResult vectorizeRootMatch(NestedMatch m,
1140                                         VectorizationStrategy *strategy) {
1141   auto loop = cast<AffineForOp>(m.getMatchedOperation());
1142   OperationFolder folder(loop.getContext());
1143   VectorizationState state;
1144   state.strategy = strategy;
1145   state.folder = &folder;
1146 
1147   // Since patterns are recursive, they can very well intersect.
1148   // Since we do not want a fully greedy strategy in general, we decouple
1149   // pattern matching, from profitability analysis, from application.
1150   // As a consequence we must check that each root pattern is still
1151   // vectorizable. If a pattern is not vectorizable anymore, we just skip it.
1152   // TODO(ntv): implement a non-greedy profitability analysis that keeps only
1153   // non-intersecting patterns.
1154   if (!isVectorizableLoopBody(loop, vectorTransferPattern())) {
1155     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
1156     return failure();
1157   }
1158 
1159   /// Sets up error handling for this root loop. This is how the root match
1160   /// maintains a clone for handling failure and restores the proper state via
1161   /// RAII.
1162   auto *loopInst = loop.getOperation();
1163   OpBuilder builder(loopInst);
1164   auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst));
1165   struct Guard {
1166     LogicalResult failure() {
1167       loop.getInductionVar().replaceAllUsesWith(clonedLoop.getInductionVar());
1168       loop.erase();
1169       return mlir::failure();
1170     }
1171     LogicalResult success() {
1172       clonedLoop.erase();
1173       return mlir::success();
1174     }
1175     AffineForOp loop;
1176     AffineForOp clonedLoop;
1177   } guard{loop, clonedLoop};
1178 
1179   //////////////////////////////////////////////////////////////////////////////
1180   // Start vectorizing.
1181   // From now on, any error triggers the scope guard above.
1182   //////////////////////////////////////////////////////////////////////////////
1183   // 1. Vectorize all the loops matched by the pattern, recursively.
1184   // This also vectorizes the roots (AffineLoadOp) as well as registers the
1185   // terminals (AffineStoreOp) for post-processing vectorization (we need to
1186   // wait for all use-def chains into them to be vectorized first).
1187   if (failed(vectorizeLoopsAndLoadsRecursively(m, &state))) {
1188     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop");
1189     return guard.failure();
1190   }
1191 
1192   // 2. Vectorize operations reached by use-def chains from root except the
1193   // terminals (store operations) that need to be post-processed separately.
1194   // TODO(ntv): add more as we expand.
1195   if (failed(vectorizeNonTerminals(&state))) {
1196     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed vectorizeNonTerminals");
1197     return guard.failure();
1198   }
1199 
1200   // 3. Post-process terminals.
1201   // Note: we have to post-process terminals because some of their defs may not
1202   // be nested under the vectorization pattern (e.g. constants defined in an
1203   // encompassing scope).
1204   // TODO(ntv): Use a backward slice for terminals, avoid special casing and
1205   // merge implementations.
1206   for (auto *op : state.terminals) {
1207     if (!vectorizeOneOperation(op, &state)) { // nullptr == failure
1208       LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed to vectorize terminals");
1209       return guard.failure();
1210     }
1211   }
1212 
1213   // 4. Finish this vectorization pattern.
1214   LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
1215   state.finishVectorizationPattern();
1216   return guard.success();
1217 }
1218 
/// Applies vectorization to the current Function by searching over a set of
/// predetermined patterns.
1221 void Vectorize::runOnFunction() {
1222   FuncOp f = getFunction();
1223   if (!fastestVaryingPattern.empty() &&
1224       fastestVaryingPattern.size() != vectorSizes.size()) {
    f.emitRemark("Fastest varying pattern specified with a different size "
                 "than the vector size.");
1227     return signalPassFailure();
1228   }
1229 
1230   // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
1231   NestedPatternContext mlContext;
1232 
1233   DenseSet<Operation *> parallelLoops;
1234   f.walk([&parallelLoops](AffineForOp loop) {
1235     if (isLoopParallel(loop))
1236       parallelLoops.insert(loop);
1237   });
1238 
1239   for (auto &pat :
1240        makePatterns(parallelLoops, vectorSizes.size(), fastestVaryingPattern)) {
1241     LLVM_DEBUG(dbgs() << "\n******************************************");
1242     LLVM_DEBUG(dbgs() << "\n******************************************");
1243     LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on Function\n");
1244     LLVM_DEBUG(f.print(dbgs()));
1245     unsigned patternDepth = pat.getDepth();
1246 
1247     SmallVector<NestedMatch, 8> matches;
1248     pat.match(f, &matches);
1249     // Iterate over all the top-level matches and vectorize eagerly.
1250     // This automatically prunes intersecting matches.
1251     for (auto m : matches) {
1252       VectorizationStrategy strategy;
1253       // TODO(ntv): depending on profitability, elect to reduce the vector size.
1254       strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());
1255       if (failed(analyzeProfitability(m.getMatchedChildren(), 1, patternDepth,
1256                                       &strategy))) {
1257         continue;
1258       }
1259       vectorizeLoopIfProfitable(m.getMatchedOperation(), 0, patternDepth,
1260                                 &strategy);
1261       // TODO(ntv): if pattern does not apply, report it; alter the
1262       // cost/benefit.
1263       vectorizeRootMatch(m, &strategy);
1264       // TODO(ntv): some diagnostics if failure to vectorize occurs.
1265     }
1266   }
1267   LLVM_DEBUG(dbgs() << "\n");
1268 }
1269 
1270 std::unique_ptr<OpPassBase<FuncOp>>
1271 mlir::createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize) {
1272   return std::make_unique<Vectorize>(virtualVectorSize);
1273 }
1274 
1275 static PassRegistration<Vectorize>
1276     pass("affine-super-vectorize",
1277          "Vectorize to a target independent n-D vector abstraction");
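
// Usage sketch (for testing purposes): the pass name and option names
// registered above are authoritative, but the exact invocation below is
// illustrative rather than taken from a specific test:
//   mlir-opt input.mlir \
//     -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0"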
1278