1 //===- SuperVectorize.cpp - Vectorize Pass Impl ---------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements vectorization of loops, operations and data types to 10 // a target-independent, n-D super-vector abstraction. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "PassDetail.h" 15 #include "mlir/Analysis/LoopAnalysis.h" 16 #include "mlir/Analysis/NestedMatcher.h" 17 #include "mlir/Analysis/SliceAnalysis.h" 18 #include "mlir/Analysis/Utils.h" 19 #include "mlir/Dialect/Affine/IR/AffineOps.h" 20 #include "mlir/Dialect/Affine/Passes.h" 21 #include "mlir/Dialect/Affine/Utils.h" 22 #include "mlir/Dialect/StandardOps/IR/Ops.h" 23 #include "mlir/Dialect/Vector/VectorOps.h" 24 #include "mlir/Dialect/Vector/VectorUtils.h" 25 #include "mlir/IR/AffineExpr.h" 26 #include "mlir/IR/Builders.h" 27 #include "mlir/IR/Location.h" 28 #include "mlir/IR/Types.h" 29 #include "mlir/Support/LLVM.h" 30 #include "mlir/Transforms/FoldUtils.h" 31 32 #include "llvm/ADT/DenseMap.h" 33 #include "llvm/ADT/DenseSet.h" 34 #include "llvm/ADT/SetVector.h" 35 #include "llvm/ADT/SmallString.h" 36 #include "llvm/ADT/SmallVector.h" 37 #include "llvm/Support/CommandLine.h" 38 #include "llvm/Support/Debug.h" 39 40 using namespace mlir; 41 using namespace vector; 42 43 /// 44 /// Implements a high-level vectorization strategy on a Function. 45 /// The abstraction used is that of super-vectors, which provide a single, 46 /// compact, representation in the vector types, information that is expected 47 /// to reduce the impact of the phase ordering problem 48 /// 49 /// Vector granularity: 50 /// =================== 51 /// This pass is designed to perform vectorization at a super-vector 52 /// granularity. A super-vector is loosely defined as a vector type that is a 53 /// multiple of a "good" vector size so the HW can efficiently implement a set 54 /// of high-level primitives. Multiple is understood along any dimension; e.g. 55 /// both vector<16xf32> and vector<2x8xf32> are valid super-vectors for a 56 /// vector<8xf32> HW vector. Note that a "good vector size so the HW can 57 /// efficiently implement a set of high-level primitives" is not necessarily an 58 /// integer multiple of actual hardware registers. We leave details of this 59 /// distinction unspecified for now. 60 /// 61 /// Some may prefer the terminology a "tile of HW vectors". In this case, one 62 /// should note that super-vectors implement an "always full tile" abstraction. 63 /// They guarantee no partial-tile separation is necessary by relying on a 64 /// high-level copy-reshape abstraction that we call vector.transfer. This 65 /// copy-reshape operations is also responsible for performing layout 66 /// transposition if necessary. In the general case this will require a scoped 67 /// allocation in some notional local memory. 68 /// 69 /// Whatever the mental model one prefers to use for this abstraction, the key 70 /// point is that we burn into a single, compact, representation in the vector 71 /// types, information that is expected to reduce the impact of the phase 72 /// ordering problem. Indeed, a vector type conveys information that: 73 /// 1. 
///      the associated loops have dependency semantics that do not prevent
///      vectorization;
///   2. the associated loops have been sliced in chunks of static sizes that
///      are compatible with vector sizes (i.e. similar to unroll-and-jam);
///   3. the inner loops, in the unroll-and-jam analogy of 2, are captured by
///      the vector type and no vectorization hampering transformations can be
///      applied to them anymore;
///   4. the underlying memrefs are accessed in some notional contiguous way
///      that allows loading into vectors with some amount of spatial locality;
/// In other words, super-vectorization provides a level of separation of
/// concern by way of opacity to subsequent passes. This has the effect of
/// encapsulating and propagating vectorization constraints down the list of
/// passes until we are ready to lower further.
///
/// For a particular target, a notion of minimal n-d vector size will be
/// specified and vectorization targets a multiple of that size. In the
/// following paragraph, let "k ." represent "a multiple of", to be understood
/// as a multiple in the same dimension (e.g. vector<16 x k . 128> summarizes
/// vector<16 x 128>, vector<16 x 256>, vector<16 x 1024>, etc).
///
/// Some non-exhaustive notable super-vector sizes of interest include:
///   - CPU: vector<k . HW_vector_size>,
///          vector<k' . core_count x k . HW_vector_size>,
///          vector<socket_count x k' . core_count x k . HW_vector_size>;
///   - GPU: vector<k . warp_size>,
///          vector<k . warp_size x float2>,
///          vector<k . warp_size x float4>,
///          vector<k . warp_size x 4 x 4 x 4> (for tensor_core sizes).
///
/// Loops and operations are emitted that operate on those super-vector shapes.
/// Subsequent lowering passes will materialize to actual HW vector sizes.
/// These passes are expected to be (gradually) more target-specific.
///
/// At a high level, a vectorized load in a loop will resemble:
/// ```mlir
///   affine.for %i = ? to ? step ? {
///     %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>
///   }
/// ```
/// It is the responsibility of the implementation of vector.transfer_read to
/// materialize vector registers from the original scalar memrefs. A later
/// (more target-dependent) lowering pass will materialize to actual HW vector
/// sizes. This lowering may occur at different times:
///   1. at the MLIR level into a combination of loops, unrolling, DmaStartOp +
///      DmaWaitOp + vectorized operations for data transformations and
///      shuffle; thus opening opportunities for unrolling and pipelining. This
///      is an instance of library call "whiteboxing"; or
///   2. later, in a target-specific lowering pass or hand-written library
///      call, achieving full separation of concerns. This is an instance of a
///      library call; or
///   3. a mix of both, e.g. based on a model.
/// In the future, these operations will expose a contract to constrain the
/// search on vectorization patterns and sizes.
///
/// Occurrence of super-vectorization in the compiler flow:
/// =======================================================
/// This is an active area of investigation. We start with 2 remarks to
/// position super-vectorization in the context of existing ongoing work: LLVM
/// VPLAN and LLVM SLP Vectorizer.
///
/// LLVM VPLAN:
/// -----------
/// The astute reader may have noticed that in the limit, super-vectorization
/// can be applied at a similar time and with similar objectives as VPLAN, for
/// instance at the end of a traditional polyhedral compilation flow (e.g. the
/// PPCG project uses ISL to provide dependence analysis, multi-level
/// (scheduling + tiling), lifting of the footprint to fast memory,
/// communication synthesis, mapping, register optimizations) and before
/// unrolling. When vectorization is applied at this *late* level in a typical
/// polyhedral flow, and is instantiated with actual hardware vector sizes,
/// super-vectorization is expected to match (or subsume) the type of patterns
/// that LLVM's VPLAN aims at targeting. The main difference here is that MLIR
/// is higher level and our implementation should be significantly simpler.
/// Also note that in this mode, recursive patterns are probably a bit of
/// overkill, although it is reasonable to expect that mixing a bit of outer
/// loop and inner loop vectorization + unrolling will provide interesting
/// choices to MLIR.
///
/// LLVM SLP Vectorizer:
/// --------------------
/// Super-vectorization, however, is not meant to be usable in a similar
/// fashion to the SLP vectorizer. The main difference lies in the information
/// that both vectorizers use: super-vectorization examines contiguity of
/// memory references along fastest varying dimensions and loops with recursive
/// nested patterns capturing imperfectly-nested loop nests; the SLP
/// vectorizer, on the other hand, performs flat pattern matching inside a
/// single unrolled loop body and stitches together pieces of load and store
/// operations into full 1-D vectors. We envision that the SLP vectorizer is a
/// good way to capture innermost-loop, control-flow-dependent patterns that
/// super-vectorization may not be able to capture easily. In other words,
/// super-vectorization does not aim at replacing the SLP vectorizer and the
/// two solutions are complementary.
///
/// Ongoing investigations:
/// -----------------------
/// We discuss the following *early* places where super-vectorization is
/// applicable and touch on the expected benefits and risks. We list the
/// opportunities in the context of the traditional polyhedral compiler flow
/// described in PPCG. There are essentially 6 places in the MLIR pass pipeline
/// we expect to experiment with super-vectorization:
///   1. Right after language lowering to MLIR: this is the earliest time where
///      super-vectorization is expected to be applied. At this level, all the
///      language/user/library-level annotations are available and can be fully
///      exploited. Examples include loop-type annotations (such as parallel,
///      reduction, scan, dependence distance vector, vectorizable) as well as
///      memory access annotations (such as non-aliasing writes guaranteed,
///      indirect accesses that are permutations by construction), or that a
///      particular operation is prescribed atomic by the user. At this level,
///      anything that enriches what dependence analysis can do should be
///      aggressively exploited.
///      At this level we are close to having explicit vector types in the
///      language, except we do not impose that burden on the
///      programmer/library: we derive information from scalar code +
///      annotations.
///   2. After dependence analysis and before polyhedral scheduling: the
///      information that supports vectorization does not need to be supplied
///      by a higher level of abstraction. Traditional dependence analysis is
///      available in MLIR and will be used to drive vectorization and cost
///      models.
///
/// Let's pause here and remark that applying super-vectorization as described
/// in 1. and 2. presents clear opportunities and risks:
///   - the opportunity is that vectorization is burned into the type system
///     and is protected from the adverse effects of loop scheduling, tiling,
///     loop interchange and all passes downstream. Provided that subsequent
///     passes are able to operate on vector types, the vector shapes,
///     associated loop iterator properties, alignment, and contiguity of
///     fastest varying dimensions are preserved until we lower the
///     super-vector types. We expect this to significantly rein in the adverse
///     effects of phase ordering.
///   - the risks are that a. all passes after super-vectorization have to work
///     on elemental vector types (note that this is always true, wherever
///     vectorization is applied) and b. that imposing vectorization
///     constraints too early may be overall detrimental to loop fusion, tiling
///     and other transformations because the dependence distances are
///     coarsened when operating on elemental vector types. For this reason,
///     the pattern profitability analysis should include a component that also
///     captures the maximal amount of fusion available under a particular
///     pattern. This is still at the stage of rough ideas but in this context,
///     search is our friend, as the Tensor Comprehensions and auto-TVM
///     contributions demonstrated previously.
/// The bottom line is that we do not yet have good answers for the above, but
/// we aim at making it easy to answer such questions.
///
/// Back to our listing, the last places where early super-vectorization makes
/// sense are:
///   3. right after polyhedral-style scheduling: PLUTO-style algorithms are
///      known to improve locality and parallelism, and to be configurable
///      (e.g. max-fuse, smart-fuse, etc.). They can also have adverse effects
///      on contiguity properties that are required for vectorization, but the
///      vector.transfer copy-reshape-pad-transpose abstraction is expected to
///      help recapture these properties.
///   4. right after polyhedral-style scheduling+tiling;
///   5. right after scheduling+tiling+rescheduling: points 4 and 5 represent
///      probably the most promising places because applying tiling achieves a
///      separation of concerns that allows rescheduling to worry less about
///      locality and more about parallelism and distribution (e.g. min-fuse).
///
/// At these levels the risk-reward looks different: on one hand we probably
/// lost a good deal of language/user/library-level annotation; on the other
/// hand we gained parallelism and locality through scheduling and tiling.
/// However, we probably want to ensure tiling is compatible with the
/// full-tile-only abstraction used in super-vectorization or suffer the
/// consequences.
/// It is too early to place bets on what will win but we expect
/// super-vectorization to be the right abstraction to allow exploring at all
/// these levels. And again, search is our friend.
///
/// Lastly, we mention it again here:
///   6. as an MLIR-based alternative to VPLAN.
///
/// Lowering, unrolling, pipelining:
/// ================================
/// TODO: point to the proper places.
///
/// Algorithm:
/// ==========
/// The algorithm proceeds in a few steps:
///   1. defining super-vectorization patterns and matching them on the tree of
///      AffineForOp. A super-vectorization pattern is defined as a recursive
///      data structure that matches and captures nested, imperfectly-nested
///      loops that have a. conformable loop annotations attached (e.g.
///      parallel, reduction, vectorizable, ...) as well as b. all contiguous
///      load/store operations along a specified minor dimension (not
///      necessarily the fastest varying);
///   2. analyzing those patterns for profitability (TODO: and interference);
///   3. then, for each pattern in order:
///      a. applying iterative rewriting of the loop and the load operations in
///         inner-to-outer order. Rewriting is implemented by coarsening the
///         loops and turning load operations into opaque vector.transfer_read
///         ops;
///      b. keeping track of the load operations encountered as "roots" and the
///         store operations as "terminals";
///      c. traversing the use-def chains starting from the roots and
///         iteratively propagating vectorized values. Scalar values that are
///         encountered during this process must come from outside the scope of
///         the current pattern (TODO: enforce this and generalize). Such a
///         scalar value is vectorized only if it is a constant (into a vector
///         splat). The non-constant case is not supported for now and results
///         in the pattern failing to vectorize;
///      d. performing a second traversal on the terminals (store ops) to
///         rewrite the scalar value they write to memory into vector form. If
///         the scalar value has been vectorized previously, we simply replace
///         it by its vector form. Otherwise, if the scalar value is a
///         constant, it is vectorized into a splat. In all other cases,
///         vectorization for the pattern currently fails;
///      e. if everything under the root AffineForOp in the current pattern
///         vectorizes properly, we commit that loop to the IR. Otherwise we
///         discard it and restore a previously cloned version of the loop.
///         Thanks to the recursive scoping nature of matchers and captured
///         patterns, this is transparently achieved by a simple RAII
///         implementation;
///      f. vectorization is applied on the next pattern in the list. Because
///         pattern interference avoidance is not yet implemented and we do not
///         support further vectorizing an already vectorized load, we need to
///         re-verify that the pattern is still vectorizable. This is expected
///         to make cost models more difficult to write and is subject to
///         improvement in the future.
///
/// Points c. and d. above are worth additional comment. In most passes that
/// do not change the type of operands, it is usually preferred to eagerly
/// `replaceAllUsesWith`.
Unfortunately this does not work for vectorization 289 /// because during the use-def chain traversal, all the operands of an operation 290 /// must be available in vector form. Trying to propagate eagerly makes the IR 291 /// temporarily invalid and results in errors such as: 292 /// `vectorize.mlir:308:13: error: 'addf' op requires the same type for all 293 /// operands and results 294 /// %s5 = addf %a5, %b5 : f32` 295 /// 296 /// Lastly, we show a minimal example for which use-def chains rooted in load / 297 /// vector.transfer_read are not enough. This is what motivated splitting 298 /// terminal processing out of the use-def chains starting from loads. In the 299 /// following snippet, there is simply no load:: 300 /// ```mlir 301 /// func @fill(%A : memref<128xf32>) -> () { 302 /// %f1 = constant 1.0 : f32 303 /// affine.for %i0 = 0 to 32 { 304 /// affine.store %f1, %A[%i0] : memref<128xf32, 0> 305 /// } 306 /// return 307 /// } 308 /// ``` 309 /// 310 /// Choice of loop transformation to support the algorithm: 311 /// ======================================================= 312 /// The choice of loop transformation to apply for coarsening vectorized loops 313 /// is still subject to exploratory tradeoffs. In particular, say we want to 314 /// vectorize by a factor 128, we want to transform the following input: 315 /// ```mlir 316 /// affine.for %i = %M to %N { 317 /// %a = affine.load %A[%i] : memref<?xf32> 318 /// } 319 /// ``` 320 /// 321 /// Traditionally, one would vectorize late (after scheduling, tiling, 322 /// memory promotion etc) say after stripmining (and potentially unrolling in 323 /// the case of LLVM's SLP vectorizer): 324 /// ```mlir 325 /// affine.for %i = floor(%M, 128) to ceil(%N, 128) { 326 /// affine.for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) { 327 /// %a = affine.load %A[%ii] : memref<?xf32> 328 /// } 329 /// } 330 /// ``` 331 /// 332 /// Instead, we seek to vectorize early and freeze vector types before 333 /// scheduling, so we want to generate a pattern that resembles: 334 /// ```mlir 335 /// affine.for %i = ? to ? step ? { 336 /// %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32> 337 /// } 338 /// ``` 339 /// 340 /// i. simply dividing the lower / upper bounds by 128 creates issues 341 /// when representing expressions such as ii + 1 because now we only 342 /// have access to original values that have been divided. Additional 343 /// information is needed to specify accesses at below-128 granularity; 344 /// ii. another alternative is to coarsen the loop step but this may have 345 /// consequences on dependence analysis and fusability of loops: fusable 346 /// loops probably need to have the same step (because we don't want to 347 /// stripmine/unroll to enable fusion). 348 /// As a consequence, we choose to represent the coarsening using the loop 349 /// step for now and reevaluate in the future. Note that we can renormalize 350 /// loop steps later if/when we have evidence that they are problematic. 351 /// 352 /// For the simple strawman example above, vectorizing for a 1-D vector 353 /// abstraction of size 128 returns code similar to: 354 /// ```mlir 355 /// affine.for %i = %M to %N step 128 { 356 /// %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32> 357 /// } 358 /// ``` 359 /// 360 /// Unsupported cases, extensions, and work in progress (help welcome :-) ): 361 /// ======================================================================== 362 /// 1. 
lowering to concrete vector types for various HW; 363 /// 2. reduction support; 364 /// 3. non-effecting padding during vector.transfer_read and filter during 365 /// vector.transfer_write; 366 /// 4. misalignment support vector.transfer_read / vector.transfer_write 367 /// (hopefully without read-modify-writes); 368 /// 5. control-flow support; 369 /// 6. cost-models, heuristics and search; 370 /// 7. Op implementation, extensions and implication on memref views; 371 /// 8. many TODOs left around. 372 /// 373 /// Examples: 374 /// ========= 375 /// Consider the following Function: 376 /// ```mlir 377 /// func @vector_add_2d(%M : index, %N : index) -> f32 { 378 /// %A = alloc (%M, %N) : memref<?x?xf32, 0> 379 /// %B = alloc (%M, %N) : memref<?x?xf32, 0> 380 /// %C = alloc (%M, %N) : memref<?x?xf32, 0> 381 /// %f1 = constant 1.0 : f32 382 /// %f2 = constant 2.0 : f32 383 /// affine.for %i0 = 0 to %M { 384 /// affine.for %i1 = 0 to %N { 385 /// // non-scoped %f1 386 /// affine.store %f1, %A[%i0, %i1] : memref<?x?xf32, 0> 387 /// } 388 /// } 389 /// affine.for %i2 = 0 to %M { 390 /// affine.for %i3 = 0 to %N { 391 /// // non-scoped %f2 392 /// affine.store %f2, %B[%i2, %i3] : memref<?x?xf32, 0> 393 /// } 394 /// } 395 /// affine.for %i4 = 0 to %M { 396 /// affine.for %i5 = 0 to %N { 397 /// %a5 = affine.load %A[%i4, %i5] : memref<?x?xf32, 0> 398 /// %b5 = affine.load %B[%i4, %i5] : memref<?x?xf32, 0> 399 /// %s5 = addf %a5, %b5 : f32 400 /// // non-scoped %f1 401 /// %s6 = addf %s5, %f1 : f32 402 /// // non-scoped %f2 403 /// %s7 = addf %s5, %f2 : f32 404 /// // diamond dependency. 405 /// %s8 = addf %s7, %s6 : f32 406 /// affine.store %s8, %C[%i4, %i5] : memref<?x?xf32, 0> 407 /// } 408 /// } 409 /// %c7 = constant 7 : index 410 /// %c42 = constant 42 : index 411 /// %res = load %C[%c7, %c42] : memref<?x?xf32, 0> 412 /// return %res : f32 413 /// } 414 /// ``` 415 /// 416 /// The -affine-vectorize pass with the following arguments: 417 /// ``` 418 /// -affine-vectorize="virtual-vector-size=256 test-fastest-varying=0" 419 /// ``` 420 /// 421 /// produces this standard innermost-loop vectorized code: 422 /// ```mlir 423 /// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 { 424 /// %0 = alloc(%arg0, %arg1) : memref<?x?xf32> 425 /// %1 = alloc(%arg0, %arg1) : memref<?x?xf32> 426 /// %2 = alloc(%arg0, %arg1) : memref<?x?xf32> 427 /// %cst = constant 1.0 : f32 428 /// %cst_0 = constant 2.0 : f32 429 /// affine.for %i0 = 0 to %arg0 { 430 /// affine.for %i1 = 0 to %arg1 step 256 { 431 /// %cst_1 = constant dense<vector<256xf32>, 1.0> : 432 /// vector<256xf32> 433 /// vector.transfer_write %cst_1, %0[%i0, %i1] : 434 /// vector<256xf32>, memref<?x?xf32> 435 /// } 436 /// } 437 /// affine.for %i2 = 0 to %arg0 { 438 /// affine.for %i3 = 0 to %arg1 step 256 { 439 /// %cst_2 = constant dense<vector<256xf32>, 2.0> : 440 /// vector<256xf32> 441 /// vector.transfer_write %cst_2, %1[%i2, %i3] : 442 /// vector<256xf32>, memref<?x?xf32> 443 /// } 444 /// } 445 /// affine.for %i4 = 0 to %arg0 { 446 /// affine.for %i5 = 0 to %arg1 step 256 { 447 /// %3 = vector.transfer_read %0[%i4, %i5] : 448 /// memref<?x?xf32>, vector<256xf32> 449 /// %4 = vector.transfer_read %1[%i4, %i5] : 450 /// memref<?x?xf32>, vector<256xf32> 451 /// %5 = addf %3, %4 : vector<256xf32> 452 /// %cst_3 = constant dense<vector<256xf32>, 1.0> : 453 /// vector<256xf32> 454 /// %6 = addf %5, %cst_3 : vector<256xf32> 455 /// %cst_4 = constant dense<vector<256xf32>, 2.0> : 456 /// vector<256xf32> 457 /// %7 = addf %5, %cst_4 : 
vector<256xf32> 458 /// %8 = addf %7, %6 : vector<256xf32> 459 /// vector.transfer_write %8, %2[%i4, %i5] : 460 /// vector<256xf32>, memref<?x?xf32> 461 /// } 462 /// } 463 /// %c7 = constant 7 : index 464 /// %c42 = constant 42 : index 465 /// %9 = load %2[%c7, %c42] : memref<?x?xf32> 466 /// return %9 : f32 467 /// } 468 /// ``` 469 /// 470 /// The -affine-vectorize pass with the following arguments: 471 /// ``` 472 /// -affine-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0" 473 /// ``` 474 /// 475 /// produces this more interesting mixed outer-innermost-loop vectorized code: 476 /// ```mlir 477 /// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 { 478 /// %0 = alloc(%arg0, %arg1) : memref<?x?xf32> 479 /// %1 = alloc(%arg0, %arg1) : memref<?x?xf32> 480 /// %2 = alloc(%arg0, %arg1) : memref<?x?xf32> 481 /// %cst = constant 1.0 : f32 482 /// %cst_0 = constant 2.0 : f32 483 /// affine.for %i0 = 0 to %arg0 step 32 { 484 /// affine.for %i1 = 0 to %arg1 step 256 { 485 /// %cst_1 = constant dense<vector<32x256xf32>, 1.0> : 486 /// vector<32x256xf32> 487 /// vector.transfer_write %cst_1, %0[%i0, %i1] : 488 /// vector<32x256xf32>, memref<?x?xf32> 489 /// } 490 /// } 491 /// affine.for %i2 = 0 to %arg0 step 32 { 492 /// affine.for %i3 = 0 to %arg1 step 256 { 493 /// %cst_2 = constant dense<vector<32x256xf32>, 2.0> : 494 /// vector<32x256xf32> 495 /// vector.transfer_write %cst_2, %1[%i2, %i3] : 496 /// vector<32x256xf32>, memref<?x?xf32> 497 /// } 498 /// } 499 /// affine.for %i4 = 0 to %arg0 step 32 { 500 /// affine.for %i5 = 0 to %arg1 step 256 { 501 /// %3 = vector.transfer_read %0[%i4, %i5] : 502 /// memref<?x?xf32> vector<32x256xf32> 503 /// %4 = vector.transfer_read %1[%i4, %i5] : 504 /// memref<?x?xf32>, vector<32x256xf32> 505 /// %5 = addf %3, %4 : vector<32x256xf32> 506 /// %cst_3 = constant dense<vector<32x256xf32>, 1.0> : 507 /// vector<32x256xf32> 508 /// %6 = addf %5, %cst_3 : vector<32x256xf32> 509 /// %cst_4 = constant dense<vector<32x256xf32>, 2.0> : 510 /// vector<32x256xf32> 511 /// %7 = addf %5, %cst_4 : vector<32x256xf32> 512 /// %8 = addf %7, %6 : vector<32x256xf32> 513 /// vector.transfer_write %8, %2[%i4, %i5] : 514 /// vector<32x256xf32>, memref<?x?xf32> 515 /// } 516 /// } 517 /// %c7 = constant 7 : index 518 /// %c42 = constant 42 : index 519 /// %9 = load %2[%c7, %c42] : memref<?x?xf32> 520 /// return %9 : f32 521 /// } 522 /// ``` 523 /// 524 /// Of course, much more intricate n-D imperfectly-nested patterns can be 525 /// vectorized too and specified in a fully declarative fashion. 526 527 #define DEBUG_TYPE "early-vect" 528 529 using llvm::dbgs; 530 using llvm::SetVector; 531 532 /// Forward declaration. 533 static FilterFunctionType 534 isVectorizableLoopPtrFactory(const DenseSet<Operation *> ¶llelLoops, 535 int fastestVaryingMemRefDimension); 536 537 /// Creates a vectorization pattern from the command line arguments. 538 /// Up to 3-D patterns are supported. 539 /// If the command line argument requests a pattern of higher order, returns an 540 /// empty pattern list which will conservatively result in no vectorization. 541 static std::vector<NestedPattern> 542 makePatterns(const DenseSet<Operation *> ¶llelLoops, int vectorRank, 543 ArrayRef<int64_t> fastestVaryingPattern) { 544 using matcher::For; 545 int64_t d0 = fastestVaryingPattern.empty() ? -1 : fastestVaryingPattern[0]; 546 int64_t d1 = fastestVaryingPattern.size() < 2 ? -1 : fastestVaryingPattern[1]; 547 int64_t d2 = fastestVaryingPattern.size() < 3 ? 
-1 : fastestVaryingPattern[2]; 548 switch (vectorRank) { 549 case 1: 550 return {For(isVectorizableLoopPtrFactory(parallelLoops, d0))}; 551 case 2: 552 return {For(isVectorizableLoopPtrFactory(parallelLoops, d0), 553 For(isVectorizableLoopPtrFactory(parallelLoops, d1)))}; 554 case 3: 555 return {For(isVectorizableLoopPtrFactory(parallelLoops, d0), 556 For(isVectorizableLoopPtrFactory(parallelLoops, d1), 557 For(isVectorizableLoopPtrFactory(parallelLoops, d2))))}; 558 default: { 559 return std::vector<NestedPattern>(); 560 } 561 } 562 } 563 564 static NestedPattern &vectorTransferPattern() { 565 static auto pattern = matcher::Op([](Operation &op) { 566 return isa<vector::TransferReadOp, vector::TransferWriteOp>(op); 567 }); 568 return pattern; 569 } 570 571 namespace { 572 573 /// Base state for the vectorize pass. 574 /// Command line arguments are preempted by non-empty pass arguments. 575 struct Vectorize : public AffineVectorizeBase<Vectorize> { 576 Vectorize() = default; 577 Vectorize(ArrayRef<int64_t> virtualVectorSize); 578 void runOnFunction() override; 579 }; 580 581 } // end anonymous namespace 582 583 Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) { 584 vectorSizes = virtualVectorSize; 585 } 586 587 static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern, 588 unsigned patternDepth, 589 VectorizationStrategy *strategy) { 590 assert(patternDepth > depthInPattern && 591 "patternDepth is greater than depthInPattern"); 592 if (patternDepth - depthInPattern > strategy->vectorSizes.size()) { 593 // Don't vectorize this loop 594 return; 595 } 596 strategy->loopToVectorDim[loop] = 597 strategy->vectorSizes.size() - (patternDepth - depthInPattern); 598 } 599 600 /// Implements a simple strawman strategy for vectorization. 601 /// Given a matched pattern `matches` of depth `patternDepth`, this strategy 602 /// greedily assigns the fastest varying dimension ** of the vector ** to the 603 /// innermost loop in the pattern. 604 /// When coupled with a pattern that looks for the fastest varying dimension in 605 /// load/store MemRefs, this creates a generic vectorization strategy that works 606 /// for any loop in a hierarchy (outermost, innermost or intermediate). 607 /// 608 /// TODO: In the future we should additionally increase the power of the 609 /// profitability analysis along 3 directions: 610 /// 1. account for loop extents (both static and parametric + annotations); 611 /// 2. account for data layout permutations; 612 /// 3. account for impact of vectorization on maximal loop fusion. 613 /// Then we can quantify the above to build a cost model and search over 614 /// strategies. 615 static LogicalResult analyzeProfitability(ArrayRef<NestedMatch> matches, 616 unsigned depthInPattern, 617 unsigned patternDepth, 618 VectorizationStrategy *strategy) { 619 for (auto m : matches) { 620 if (failed(analyzeProfitability(m.getMatchedChildren(), depthInPattern + 1, 621 patternDepth, strategy))) { 622 return failure(); 623 } 624 vectorizeLoopIfProfitable(m.getMatchedOperation(), depthInPattern, 625 patternDepth, strategy); 626 } 627 return success(); 628 } 629 630 ///// end TODO: Hoist to a VectorizationStrategy.cpp when appropriate ///// 631 632 namespace { 633 634 struct VectorizationState { 635 /// Adds an entry of pre/post vectorization operations in the state. 
636 void registerReplacement(Operation *key, Operation *value); 637 /// When the current vectorization pattern is successful, this erases the 638 /// operations that were marked for erasure in the proper order and resets 639 /// the internal state for the next pattern. 640 void finishVectorizationPattern(); 641 642 // In-order tracking of original Operation that have been vectorized. 643 // Erase in reverse order. 644 SmallVector<Operation *, 16> toErase; 645 // Set of Operation that have been vectorized (the values in the 646 // vectorizationMap for hashed access). The vectorizedSet is used in 647 // particular to filter the operations that have already been vectorized by 648 // this pattern, when iterating over nested loops in this pattern. 649 DenseSet<Operation *> vectorizedSet; 650 // Map of old scalar Operation to new vectorized Operation. 651 DenseMap<Operation *, Operation *> vectorizationMap; 652 // Map of old scalar Value to new vectorized Value. 653 DenseMap<Value, Value> replacementMap; 654 // The strategy drives which loop to vectorize by which amount. 655 const VectorizationStrategy *strategy; 656 // Use-def roots. These represent the starting points for the worklist in the 657 // vectorizeNonTerminals function. They consist of the subset of load 658 // operations that have been vectorized. They can be retrieved from 659 // `vectorizationMap` but it is convenient to keep track of them in a separate 660 // data structure. 661 DenseSet<Operation *> roots; 662 // Terminal operations for the worklist in the vectorizeNonTerminals 663 // function. They consist of the subset of store operations that have been 664 // vectorized. They can be retrieved from `vectorizationMap` but it is 665 // convenient to keep track of them in a separate data structure. Since they 666 // do not necessarily belong to use-def chains starting from loads (e.g 667 // storing a constant), we need to handle them in a post-pass. 668 DenseSet<Operation *> terminals; 669 // Checks that the type of `op` is AffineStoreOp and adds it to the terminals 670 // set. 671 void registerTerminal(Operation *op); 672 // Folder used to factor out constant creation. 
673 OperationFolder *folder; 674 675 private: 676 void registerReplacement(Value key, Value value); 677 }; 678 679 } // end namespace 680 681 void VectorizationState::registerReplacement(Operation *key, Operation *value) { 682 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op: "); 683 LLVM_DEBUG(key->print(dbgs())); 684 LLVM_DEBUG(dbgs() << " into "); 685 LLVM_DEBUG(value->print(dbgs())); 686 assert(key->getNumResults() == 1 && "already registered"); 687 assert(value->getNumResults() == 1 && "already registered"); 688 assert(vectorizedSet.count(value) == 0 && "already registered"); 689 assert(vectorizationMap.count(key) == 0 && "already registered"); 690 toErase.push_back(key); 691 vectorizedSet.insert(value); 692 vectorizationMap.insert(std::make_pair(key, value)); 693 registerReplacement(key->getResult(0), value->getResult(0)); 694 if (isa<AffineLoadOp>(key)) { 695 assert(roots.count(key) == 0 && "root was already inserted previously"); 696 roots.insert(key); 697 } 698 } 699 700 void VectorizationState::registerTerminal(Operation *op) { 701 assert(isa<AffineStoreOp>(op) && "terminal must be a AffineStoreOp"); 702 assert(terminals.count(op) == 0 && 703 "terminal was already inserted previously"); 704 terminals.insert(op); 705 } 706 707 void VectorizationState::finishVectorizationPattern() { 708 while (!toErase.empty()) { 709 auto *op = toErase.pop_back_val(); 710 LLVM_DEBUG(dbgs() << "\n[early-vect] finishVectorizationPattern erase: "); 711 LLVM_DEBUG(op->print(dbgs())); 712 op->erase(); 713 } 714 } 715 716 void VectorizationState::registerReplacement(Value key, Value value) { 717 assert(replacementMap.count(key) == 0 && "replacement already registered"); 718 replacementMap.insert(std::make_pair(key, value)); 719 } 720 721 // Apply 'map' with 'mapOperands' returning resulting values in 'results'. 722 static void computeMemoryOpIndices(Operation *op, AffineMap map, 723 ValueRange mapOperands, 724 SmallVectorImpl<Value> &results) { 725 OpBuilder builder(op); 726 for (auto resultExpr : map.getResults()) { 727 auto singleResMap = 728 AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr); 729 auto afOp = 730 builder.create<AffineApplyOp>(op->getLoc(), singleResMap, mapOperands); 731 results.push_back(afOp); 732 } 733 } 734 735 ////// TODO: Hoist to a VectorizationMaterialize.cpp when appropriate. //// 736 737 /// Handles the vectorization of load and store MLIR operations. 738 /// 739 /// AffineLoadOp operations are the roots of the vectorizeNonTerminals call. 740 /// They are vectorized immediately. The resulting vector.transfer_read is 741 /// immediately registered to replace all uses of the AffineLoadOp in this 742 /// pattern's scope. 743 /// 744 /// AffineStoreOp are the terminals of the vectorizeNonTerminals call. They 745 /// need to be vectorized late once all the use-def chains have been traversed. 746 /// Additionally, they may have ssa-values operands which come from outside the 747 /// scope of the current pattern. 748 /// Such special cases force us to delay the vectorization of the stores until 749 /// the last step. Here we merely register the store operation. 750 template <typename LoadOrStoreOpPointer> 751 static LogicalResult vectorizeRootOrTerminal(Value iv, 752 LoadOrStoreOpPointer memoryOp, 753 VectorizationState *state) { 754 auto memRefType = memoryOp.getMemRef().getType().template cast<MemRefType>(); 755 756 auto elementType = memRefType.getElementType(); 757 // TODO: ponder whether we want to further vectorize a vector value. 
  assert(VectorType::isValidElementType(elementType) &&
         "Not a valid vector element type");
  auto vectorType = VectorType::get(state->strategy->vectorSizes, elementType);

  // Materialize a MemRef with 1 vector.
  auto *opInst = memoryOp.getOperation();
  // For now, vector.transfers must be aligned, operate only on indices with an
  // identity subset of AffineMap and do not change layout.
  // TODO: increase the expressive power of vector.transfer operations
  // as needed by various targets.
  if (auto load = dyn_cast<AffineLoadOp>(opInst)) {
    OpBuilder b(opInst);
    ValueRange mapOperands = load.getMapOperands();
    SmallVector<Value, 8> indices;
    indices.reserve(load.getMemRefType().getRank());
    if (load.getAffineMap() !=
        b.getMultiDimIdentityMap(load.getMemRefType().getRank())) {
      computeMemoryOpIndices(opInst, load.getAffineMap(), mapOperands, indices);
    } else {
      indices.append(mapOperands.begin(), mapOperands.end());
    }
    auto permutationMap =
        makePermutationMap(opInst, indices, state->strategy->loopToVectorDim);
    if (!permutationMap)
      return LogicalResult::Failure;
    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
    LLVM_DEBUG(permutationMap.print(dbgs()));
    auto transfer = b.create<vector::TransferReadOp>(
        opInst->getLoc(), vectorType, memoryOp.getMemRef(), indices,
        permutationMap);
    state->registerReplacement(opInst, transfer.getOperation());
  } else {
    state->registerTerminal(opInst);
  }
  return success();
}
/// end TODO: Hoist to a VectorizationMaterialize.cpp when appropriate. ///

/// Coarsens the loop bounds and transforms all remaining load and store
/// operations into the appropriate vector.transfer.
static LogicalResult vectorizeAffineForOp(AffineForOp loop, int64_t step,
                                          VectorizationState *state) {
  loop.setStep(step);

  FilterFunctionType notVectorizedThisPattern = [state](Operation &op) {
    if (!matcher::isLoadOrStore(op)) {
      return false;
    }
    return state->vectorizationMap.count(&op) == 0 &&
           state->vectorizedSet.count(&op) == 0 &&
           state->roots.count(&op) == 0 && state->terminals.count(&op) == 0;
  };
  auto loadAndStores = matcher::Op(notVectorizedThisPattern);
  SmallVector<NestedMatch, 8> loadAndStoresMatches;
  loadAndStores.match(loop.getOperation(), &loadAndStoresMatches);
  for (auto ls : loadAndStoresMatches) {
    auto *opInst = ls.getMatchedOperation();
    auto load = dyn_cast<AffineLoadOp>(opInst);
    auto store = dyn_cast<AffineStoreOp>(opInst);
    LLVM_DEBUG(opInst->print(dbgs()));
    LogicalResult result =
        load ? vectorizeRootOrTerminal(loop.getInductionVar(), load, state)
             : vectorizeRootOrTerminal(loop.getInductionVar(), store, state);
    if (failed(result)) {
      return failure();
    }
  }
  return success();
}

/// Returns a FilterFunctionType that can be used in NestedPattern to match a
/// loop whose underlying load/store accesses are either invariant or all
/// varying along the `fastestVaryingMemRefDimension`.
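/// For instance (an illustrative sketch only, following the convention used by
/// the `test-fastest-varying` examples at the top of this file, where 0
/// denotes the fastest-varying memref dimension), with
/// `fastestVaryingMemRefDimension` == 0 a loop such as:
/// ```mlir
///   affine.for %i1 = 0 to %N {
///     %a = affine.load %A[%i0, %i1] : memref<?x?xf32>
///   }
/// ```
/// is a candidate, assuming %i1 is registered in `parallelLoops`, because its
/// accesses vary along the fastest-varying dimension of %A; the same loop
/// accessing %A[%i1, %i0] would not match that dimension.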
static FilterFunctionType
isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,
                             int fastestVaryingMemRefDimension) {
  return [&parallelLoops, fastestVaryingMemRefDimension](Operation &forOp) {
    auto loop = cast<AffineForOp>(forOp);
    auto parallelIt = parallelLoops.find(loop);
    if (parallelIt == parallelLoops.end())
      return false;
    int memRefDim = -1;
    auto vectorizableBody =
        isVectorizableLoopBody(loop, &memRefDim, vectorTransferPattern());
    if (!vectorizableBody)
      return false;
    return memRefDim == -1 || fastestVaryingMemRefDimension == -1 ||
           memRefDim == fastestVaryingMemRefDimension;
  };
}

/// Applies vectorization to `loops` according to `state`. The loops are
/// processed in inner-to-outer order to ensure that all the child loops have
/// already been vectorized before vectorizing the parent loop.
static LogicalResult
vectorizeLoopsAndLoads(std::vector<SmallVector<AffineForOp, 2>> &loops,
                       VectorizationState *state) {
  // Vectorize loops in inner-to-outer order. If any child fails, the parent
  // will fail too.
  for (auto &loopsInLevel : llvm::reverse(loops)) {
    for (AffineForOp loop : loopsInLevel) {
      // 1. This loop may have been omitted from vectorization for various
      // reasons (e.g. due to the performance model or pattern depth > vector
      // size).
      auto it = state->strategy->loopToVectorDim.find(loop.getOperation());
      if (it == state->strategy->loopToVectorDim.end())
        continue;

      // 2. Actual inner-to-outer transformation.
      auto vectorDim = it->second;
      assert(vectorDim < state->strategy->vectorSizes.size() &&
             "vector dim overflow");
      // a. get the actual vector size
      auto vectorSize = state->strategy->vectorSizes[vectorDim];
      // b. loop transformation for early vectorization is still subject to
      // exploratory tradeoffs (see top of the file). Apply coarsening, i.e.:
      //   | ub   -> ub
      //   | step -> step * vectorSize
      LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
                        << " : \n"
                        << loop);
      if (failed(
              vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state)))
        return failure();
    } // end for.
  }

  return success();
}

/// Tries to transform a scalar constant into a vector splat of that constant.
/// Returns the vectorized splat operation if the constant is a valid vector
/// element type.
/// If `type` is not a valid vector type or if the scalar constant is not a
/// valid vector element type, returns nullptr.
static Value vectorizeConstant(Operation *op, ConstantOp constant, Type type) {
  if (!type || !type.isa<VectorType>() ||
      !VectorType::isValidElementType(constant.getType())) {
    return nullptr;
  }
  OpBuilder b(op);
  Location loc = op->getLoc();
  auto vectorType = type.cast<VectorType>();
  auto attr = DenseElementsAttr::get(vectorType, constant.getValue());
  auto *constantOpInst = constant.getOperation();

  OperationState state(loc, constantOpInst->getName().getStringRef(), {},
                       {vectorType}, {b.getNamedAttr("value", attr)});

  return b.createOperation(state)->getResult(0);
}

/// Returns the vector type resulting from applying the provided vectorization
/// strategy on the scalar type.
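/// For example, with `strategy->vectorSizes` == {32, 256} (as in the 2-D
/// example at the top of this file), an `f32` scalar type maps to
/// `vector<32x256xf32>`.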
static VectorType getVectorType(Type scalarTy,
                                const VectorizationStrategy *strategy) {
  assert(!scalarTy.isa<VectorType>() && "Expected scalar type");
  return VectorType::get(strategy->vectorSizes, scalarTy);
}

/// Returns true if the provided value is vector uniform given the
/// vectorization strategy.
// TODO: For now, only values that are invariant to all the loops in the
// vectorization strategy are considered vector uniforms.
static bool isUniformDefinition(Value value,
                                const VectorizationStrategy *strategy) {
  for (auto loopToDim : strategy->loopToVectorDim) {
    auto loop = cast<AffineForOp>(loopToDim.first);
    if (!loop.isDefinedOutsideOfLoop(value))
      return false;
  }
  return true;
}

/// Generates a broadcast op for the provided uniform value using the
/// vectorization strategy in 'state'.
static Value vectorizeUniform(Value value, VectorizationState *state) {
  OpBuilder builder(value.getContext());
  builder.setInsertionPointAfterValue(value);

  auto vectorTy = getVectorType(value.getType(), state->strategy);
  auto bcast = builder.create<BroadcastOp>(value.getLoc(), vectorTy, value);

  // Add broadcast to the replacement map to reuse it for other uses.
  state->replacementMap[value] = bcast;
  return bcast;
}

/// Tries to vectorize a given `operand` of Operation `op` during def-chain
/// propagation or during terminal vectorization, by applying the following
/// logic:
///   1. if the defining operation is part of the vectorizedSet (i.e.
///      vectorized by use-def propagation), `operand` is already in the proper
///      vector form;
///   2. otherwise, `operand` may be in some other vector form that fails to
///      vectorize at this time (i.e. broadcasting required); returns nullptr
///      to indicate failure;
///   3. if `operand` is a constant, returns the vectorized form of the
///      constant;
///   4. if `operand` is uniform, returns a vector broadcast of `operand`;
///   5. non-constant scalars are currently non-vectorizable, in particular to
///      guard against vectorizing an index which may be loop-variant and needs
///      special handling.
///
/// In particular, this logic captures some of the use cases where definitions
/// that are not scoped under the current pattern are needed to vectorize. One
/// such example is top-level function constants that need to be splatted.
///
/// Returns an operand that has been vectorized to match `state`'s strategy if
/// vectorization is possible with the above logic. Returns nullptr otherwise.
///
/// TODO: handle more complex cases.
static Value vectorizeOperand(Value operand, Operation *op,
                              VectorizationState *state) {
  LLVM_DEBUG(dbgs() << "\n[early-vect]vectorize operand: " << operand);
  // 1. If this value has already been vectorized this round, we are done.
  if (state->vectorizedSet.count(operand.getDefiningOp()) > 0) {
    LLVM_DEBUG(dbgs() << " -> already vector operand");
    return operand;
  }
  // 1.b. Delayed on-demand replacement of a use.
  // Note that we cannot just call replaceAllUsesWith because it may result
  // in ops with mixed types, for ops whose operands have not all yet
  // been vectorized. This would be invalid IR.
  auto it = state->replacementMap.find(operand);
  if (it != state->replacementMap.end()) {
    auto res = it->second;
    LLVM_DEBUG(dbgs() << "-> delayed replacement by: " << res);
    return res;
  }
  // 2.
TODO: broadcast needed. 988 if (operand.getType().isa<VectorType>()) { 989 LLVM_DEBUG(dbgs() << "-> non-vectorizable"); 990 return nullptr; 991 } 992 // 3. vectorize constant. 993 if (auto constant = operand.getDefiningOp<ConstantOp>()) 994 return vectorizeConstant(op, constant, 995 getVectorType(operand.getType(), state->strategy)); 996 997 // 4. Uniform values. 998 if (isUniformDefinition(operand, state->strategy)) 999 return vectorizeUniform(operand, state); 1000 1001 // 5. currently non-vectorizable. 1002 LLVM_DEBUG(dbgs() << "-> non-vectorizable: " << operand); 1003 return nullptr; 1004 } 1005 1006 /// Encodes Operation-specific behavior for vectorization. In general we assume 1007 /// that all operands of an op must be vectorized but this is not always true. 1008 /// In the future, it would be nice to have a trait that describes how a 1009 /// particular operation vectorizes. For now we implement the case distinction 1010 /// here. 1011 /// Returns a vectorized form of an operation or nullptr if vectorization fails. 1012 // TODO: consider adding a trait to Op to describe how it gets vectorized. 1013 // Maybe some Ops are not vectorizable or require some tricky logic, we cannot 1014 // do one-off logic here; ideally it would be TableGen'd. 1015 static Operation *vectorizeOneOperation(Operation *opInst, 1016 VectorizationState *state) { 1017 // Sanity checks. 1018 assert(!isa<AffineLoadOp>(opInst) && 1019 "all loads must have already been fully vectorized independently"); 1020 assert(!isa<vector::TransferReadOp>(opInst) && 1021 "vector.transfer_read cannot be further vectorized"); 1022 assert(!isa<vector::TransferWriteOp>(opInst) && 1023 "vector.transfer_write cannot be further vectorized"); 1024 1025 if (auto store = dyn_cast<AffineStoreOp>(opInst)) { 1026 OpBuilder b(opInst); 1027 auto memRef = store.getMemRef(); 1028 auto value = store.getValueToStore(); 1029 auto vectorValue = vectorizeOperand(value, opInst, state); 1030 if (!vectorValue) 1031 return nullptr; 1032 1033 ValueRange mapOperands = store.getMapOperands(); 1034 SmallVector<Value, 8> indices; 1035 indices.reserve(store.getMemRefType().getRank()); 1036 if (store.getAffineMap() != 1037 b.getMultiDimIdentityMap(store.getMemRefType().getRank())) { 1038 computeMemoryOpIndices(opInst, store.getAffineMap(), mapOperands, 1039 indices); 1040 } else { 1041 indices.append(mapOperands.begin(), mapOperands.end()); 1042 } 1043 1044 auto permutationMap = 1045 makePermutationMap(opInst, indices, state->strategy->loopToVectorDim); 1046 if (!permutationMap) 1047 return nullptr; 1048 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: "); 1049 LLVM_DEBUG(permutationMap.print(dbgs())); 1050 auto transfer = b.create<vector::TransferWriteOp>( 1051 opInst->getLoc(), vectorValue, memRef, indices, permutationMap); 1052 auto *res = transfer.getOperation(); 1053 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << *res); 1054 // "Terminals" (i.e. AffineStoreOps) are erased on the spot. 1055 opInst->erase(); 1056 return res; 1057 } 1058 if (opInst->getNumRegions() != 0) 1059 return nullptr; 1060 1061 SmallVector<Type, 8> vectorTypes; 1062 for (auto v : opInst->getResults()) { 1063 vectorTypes.push_back( 1064 VectorType::get(state->strategy->vectorSizes, v.getType())); 1065 } 1066 SmallVector<Value, 8> vectorOperands; 1067 for (auto v : opInst->getOperands()) { 1068 vectorOperands.push_back(vectorizeOperand(v, opInst, state)); 1069 } 1070 // Check whether a single operand is null. If so, vectorization failed. 
  bool success = llvm::all_of(vectorOperands, [](Value op) { return op; });
  if (!success) {
    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ an operand failed vectorize");
    return nullptr;
  }

  // Create a clone of the op with the proper operands and return types.
  // TODO: The following assumes there is always an op with a fixed
  // name that works both in scalar mode and vector mode.
  // TODO: Is it worth considering an Operation.clone operation which
  // changes the type so we can promote an Operation with less boilerplate?
  OpBuilder b(opInst);
  OperationState newOp(opInst->getLoc(), opInst->getName().getStringRef(),
                       vectorOperands, vectorTypes, opInst->getAttrs(),
                       /*successors=*/{}, /*regions=*/{});
  return b.createOperation(newOp);
}

/// Iterates over the forward slice from the loads in the vectorization pattern
/// and rewrites them using their vectorized counterpart by:
///   1. Creating the forward slice starting from the loads in the
///      vectorization pattern.
///   2. Topologically sorting the forward slice.
///   3. For each operation in the slice, creating the vector form of the
///      operation, replacing each operand by its vectorized replacement
///      retrieved from `replacementMap`. If any such replacement is missing,
///      vectorization fails.
static LogicalResult vectorizeNonTerminals(VectorizationState *state) {
  // 1. Create the initial worklist with the uses of the roots.
  SetVector<Operation *> worklist;
  // Note: state->roots have already been vectorized and must not be vectorized
  // again. This fits `getForwardSlice` which does not insert `op` in the
  // result.
  // Note: we have to exclude terminals because some of their defs may not be
  // nested under the vectorization pattern (e.g. constants defined in an
  // encompassing scope).
  // TODO: Use a backward slice for terminals, avoid special casing and
  // merge implementations.
  for (auto *op : state->roots) {
    getForwardSlice(op, &worklist, [state](Operation *op) {
      return state->terminals.count(op) == 0; // propagate if not terminal
    });
  }
  // We merged multiple slices; topological order may not hold anymore.
  worklist = topologicalSort(worklist);

  for (unsigned i = 0; i < worklist.size(); ++i) {
    auto *op = worklist[i];
    LLVM_DEBUG(dbgs() << "\n[early-vect] vectorize use: ");
    LLVM_DEBUG(op->print(dbgs()));

    // 2. Create the vector form of the operation.
    // Insert it just before op; on success register op as replaced.
    auto *vectorizedInst = vectorizeOneOperation(op, state);
    if (!vectorizedInst) {
      return failure();
    }

    // 3. Register replacement for future uses in the scope.
    // Note that we cannot just call replaceAllUsesWith because it may
    // result in ops with mixed types, for ops whose operands have not all
    // yet been vectorized. This would be invalid IR.
    state->registerReplacement(op, vectorizedInst);
  }
  return success();
}

/// Recursive implementation to convert all the nested loops in 'match' to a 2D
/// vector container that preserves the relative nesting level of each loop
/// with respect to the others in 'match'. 'currentLevel' is the nesting level
/// that will be assigned to the loop in the current 'match'.
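/// As an illustrative sketch, for a match rooted at the following imperfect
/// nest (schematic bounds):
/// ```mlir
///   affine.for %i = 0 to %M {     // level 0
///     affine.for %j = 0 to %N {   // level 1
///       affine.for %k = 0 to %P { // level 2
///       }
///     }
///     affine.for %l = 0 to %N {   // level 1
///     }
///   }
/// ```
/// the resulting container is loops[0] = {%i}, loops[1] = {%j, %l},
/// loops[2] = {%k}.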
1141 static void 1142 getMatchedAffineLoopsRec(NestedMatch match, unsigned currentLevel, 1143 std::vector<SmallVector<AffineForOp, 2>> &loops) { 1144 // Add a new empty level to the output if it doesn't exist already. 1145 assert(currentLevel <= loops.size() && "Unexpected currentLevel"); 1146 if (currentLevel == loops.size()) 1147 loops.push_back(SmallVector<AffineForOp, 2>()); 1148 1149 // Add current match and recursively visit its children. 1150 loops[currentLevel].push_back(cast<AffineForOp>(match.getMatchedOperation())); 1151 for (auto childMatch : match.getMatchedChildren()) { 1152 getMatchedAffineLoopsRec(childMatch, currentLevel + 1, loops); 1153 } 1154 } 1155 1156 /// Converts all the nested loops in 'match' to a 2D vector container that 1157 /// preserves the relative nesting level of each loop with respect to the others 1158 /// in 'match'. This means that every loop in 'loops[i]' will have a parent loop 1159 /// in 'loops[i-1]'. A loop in 'loops[i]' may or may not have a child loop in 1160 /// 'loops[i+1]'. 1161 static void 1162 getMatchedAffineLoops(NestedMatch match, 1163 std::vector<SmallVector<AffineForOp, 2>> &loops) { 1164 getMatchedAffineLoopsRec(match, /*currLoopDepth=*/0, loops); 1165 } 1166 1167 /// Internal implementation to vectorize affine loops from a single loop nest 1168 /// using an n-D vectorization strategy. 1169 static LogicalResult 1170 vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops, 1171 const VectorizationStrategy &strategy) { 1172 assert(loops[0].size() == 1 && "Expected single root loop"); 1173 AffineForOp rootLoop = loops[0][0]; 1174 OperationFolder folder(rootLoop.getContext()); 1175 VectorizationState state; 1176 state.strategy = &strategy; 1177 state.folder = &folder; 1178 1179 // Since patterns are recursive, they can very well intersect. 1180 // Since we do not want a fully greedy strategy in general, we decouple 1181 // pattern matching, from profitability analysis, from application. 1182 // As a consequence we must check that each root pattern is still 1183 // vectorizable. If a pattern is not vectorizable anymore, we just skip it. 1184 // TODO: implement a non-greedy profitability analysis that keeps only 1185 // non-intersecting patterns. 1186 if (!isVectorizableLoopBody(rootLoop, vectorTransferPattern())) { 1187 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable"); 1188 return failure(); 1189 } 1190 1191 /// Sets up error handling for this root loop. This is how the root match 1192 /// maintains a clone for handling failure and restores the proper state via 1193 /// RAII. 1194 auto *loopInst = rootLoop.getOperation(); 1195 OpBuilder builder(loopInst); 1196 auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst)); 1197 struct Guard { 1198 LogicalResult failure() { 1199 loop.getInductionVar().replaceAllUsesWith(clonedLoop.getInductionVar()); 1200 loop.erase(); 1201 return mlir::failure(); 1202 } 1203 LogicalResult success() { 1204 clonedLoop.erase(); 1205 return mlir::success(); 1206 } 1207 AffineForOp loop; 1208 AffineForOp clonedLoop; 1209 } guard{rootLoop, clonedLoop}; 1210 1211 ////////////////////////////////////////////////////////////////////////////// 1212 // Start vectorizing. 1213 // From now on, any error triggers the scope guard above. 1214 ////////////////////////////////////////////////////////////////////////////// 1215 // 1. Vectorize all the loop candidates, in inner-to-outer order. 

/// Internal implementation to vectorize affine loops from a single loop nest
/// using an n-D vectorization strategy.
static LogicalResult
vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
                  const VectorizationStrategy &strategy) {
  assert(loops[0].size() == 1 && "Expected single root loop");
  AffineForOp rootLoop = loops[0][0];
  OperationFolder folder(rootLoop.getContext());
  VectorizationState state;
  state.strategy = &strategy;
  state.folder = &folder;

  // Since patterns are recursive, they can very well intersect.
  // Since we do not want a fully greedy strategy in general, we decouple
  // pattern matching, from profitability analysis, from application.
  // As a consequence we must check that each root pattern is still
  // vectorizable. If a pattern is not vectorizable anymore, we just skip it.
  // TODO: implement a non-greedy profitability analysis that keeps only
  // non-intersecting patterns.
  if (!isVectorizableLoopBody(rootLoop, vectorTransferPattern())) {
    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
    return failure();
  }

  /// Sets up error handling for this root loop. This is how the root match
  /// maintains a clone for handling failure and restores the proper state via
  /// RAII.
  auto *loopInst = rootLoop.getOperation();
  OpBuilder builder(loopInst);
  auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst));
  struct Guard {
    LogicalResult failure() {
      loop.getInductionVar().replaceAllUsesWith(clonedLoop.getInductionVar());
      loop.erase();
      return mlir::failure();
    }
    LogicalResult success() {
      clonedLoop.erase();
      return mlir::success();
    }
    AffineForOp loop;
    AffineForOp clonedLoop;
  } guard{rootLoop, clonedLoop};

  //////////////////////////////////////////////////////////////////////////////
  // Start vectorizing.
  // From now on, any error triggers the scope guard above.
  //////////////////////////////////////////////////////////////////////////////
  // 1. Vectorize all the loop candidates, in inner-to-outer order.
  // This also vectorizes the roots (AffineLoadOp) and registers the terminals
  // (AffineStoreOp) for post-processing vectorization: we need to wait for all
  // use-def chains into them to be vectorized first.
  if (failed(vectorizeLoopsAndLoads(loops, &state))) {
    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop");
    return guard.failure();
  }

  // 2. Vectorize operations reached by use-def chains from the roots except
  // the terminals (store operations) that need to be post-processed
  // separately.
  // TODO: add more as we expand.
  if (failed(vectorizeNonTerminals(&state))) {
    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed vectorizeNonTerminals");
    return guard.failure();
  }

  // 3. Post-process terminals.
  // Note: we have to post-process terminals because some of their defs may not
  // be nested under the vectorization pattern (e.g. constants defined in an
  // encompassing scope).
  // TODO: Use a backward slice for terminals, avoid special casing and
  // merge implementations.
  for (auto *op : state.terminals) {
    if (!vectorizeOneOperation(op, &state)) { // nullptr == failure
      LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed to vectorize terminals");
      return guard.failure();
    }
  }

  // 4. Finish this vectorization pattern.
  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
  state.finishVectorizationPattern();
  return guard.success();
}

/// Vectorization is a recursive procedure where anything below can fail. The
/// root match thus needs to maintain a clone for handling failure. Each root
/// may succeed independently but will otherwise clean up after itself if
/// anything below it fails.
static LogicalResult vectorizeRootMatch(NestedMatch m,
                                        const VectorizationStrategy &strategy) {
  std::vector<SmallVector<AffineForOp, 2>> loopsToVectorize;
  getMatchedAffineLoops(m, loopsToVectorize);
  return vectorizeLoopNest(loopsToVectorize, strategy);
}
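
// For intuition, an end-to-end sketch (illustrative, not taken from a test) of
// what a successful root match produces with a hypothetical 1-D strategy of
// size 128; padding values and permutation maps are omitted for brevity:
//
//   affine.for %i = 0 to 1024 {
//     %a = affine.load %A[%i] : memref<1024xf32>
//     %b = affine.load %B[%i] : memref<1024xf32>
//     %s = addf %a, %b : f32
//     affine.store %s, %C[%i] : memref<1024xf32>
//   }
//
// becomes, roughly:
//
//   affine.for %i = 0 to 1024 step 128 {
//     %a = vector.transfer_read %A[%i] : memref<1024xf32>, vector<128xf32>
//     %b = vector.transfer_read %B[%i] : memref<1024xf32>, vector<128xf32>
//     %s = addf %a, %b : vector<128xf32>
//     vector.transfer_write %s, %C[%i] : vector<128xf32>, memref<1024xf32>
//   }
//
// The loop step becomes the vector size along that loop, loads/stores become
// vector.transfer_read/vector.transfer_write, and elementwise ops are cloned
// with vector operand and result types.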

/// Internal implementation to vectorize affine loops in 'loops' using the n-D
/// vectorization factors in 'vectorSizes'. By default, each vectorization
/// factor is applied inner-to-outer to the loops of each loop nest.
/// 'fastestVaryingPattern' can be optionally used to provide a different loop
/// vectorization order.
static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
                           ArrayRef<int64_t> vectorSizes,
                           ArrayRef<int64_t> fastestVaryingPattern) {
  for (auto &pat :
       makePatterns(loops, vectorSizes.size(), fastestVaryingPattern)) {
    LLVM_DEBUG(dbgs() << "\n******************************************");
    LLVM_DEBUG(dbgs() << "\n******************************************");
    LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on parent op\n");
    LLVM_DEBUG(parentOp->print(dbgs()));

    unsigned patternDepth = pat.getDepth();

    SmallVector<NestedMatch, 8> matches;
    pat.match(parentOp, &matches);
    // Iterate over all the top-level matches and vectorize eagerly.
    // This automatically prunes intersecting matches.
    for (auto m : matches) {
      VectorizationStrategy strategy;
      // TODO: depending on profitability, elect to reduce the vector size.
      strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());
      if (failed(analyzeProfitability(m.getMatchedChildren(), 1, patternDepth,
                                      &strategy))) {
        continue;
      }
      vectorizeLoopIfProfitable(m.getMatchedOperation(), 0, patternDepth,
                                &strategy);
      // TODO: if the pattern does not apply, report it; alter the
      // cost/benefit.
      vectorizeRootMatch(m, strategy);
      // TODO: some diagnostics if failure to vectorize occurs.
    }
  }
  LLVM_DEBUG(dbgs() << "\n");
}

/// Applies vectorization to the current function by searching over a bunch of
/// predetermined patterns.
void Vectorize::runOnFunction() {
  FuncOp f = getFunction();
  if (!fastestVaryingPattern.empty() &&
      fastestVaryingPattern.size() != vectorSizes.size()) {
    f.emitRemark("Fastest varying pattern specified with a different size than "
                 "the vector size.");
    return signalPassFailure();
  }

  DenseSet<Operation *> parallelLoops;
  f.walk([&parallelLoops](AffineForOp loop) {
    if (isLoopParallel(loop))
      parallelLoops.insert(loop);
  });

  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
  NestedPatternContext mlContext;
  vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
}

/// Verify that affine loops in 'loops' meet the nesting criteria expected by
/// SuperVectorizer:
/// * There must be at least one loop.
/// * There must be a single root loop (nesting level 0).
/// * Each loop at a given nesting level must be nested in a loop from a
///   previous nesting level.
static void
verifyLoopNesting(const std::vector<SmallVector<AffineForOp, 2>> &loops) {
  assert(!loops.empty() && "Expected at least one loop");
  assert(loops[0].size() == 1 && "Expected only one root loop");

  // Traverse loops outer-to-inner to check some invariants.
  for (int i = 1, end = loops.size(); i < end; ++i) {
    for (AffineForOp loop : loops[i]) {
      // Check that each loop at this level is nested in one of the loops from
      // the previous level.
      bool parentFound = false;
      for (AffineForOp maybeParent : loops[i - 1]) {
        if (maybeParent.getOperation()->isProperAncestor(loop)) {
          parentFound = true;
          break;
        }
      }
      assert(parentFound && "Child loop not nested in any parent loop");

      // Check that each loop at this level is not nested in another loop from
      // this level.
      for (AffineForOp sibling : loops[i])
        assert(!sibling.getOperation()->isProperAncestor(loop) &&
               "Loops at the same level are nested");
    }
  }
}

namespace mlir {

/// External utility to vectorize affine loops in 'loops' using the n-D
/// vectorization factors in 'vectorSizes'. By default, each vectorization
/// factor is applied inner-to-outer to the loops of each loop nest.
/// 'fastestVaryingPattern' can be optionally used to provide a different loop
/// vectorization order.
void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
                          ArrayRef<int64_t> vectorSizes,
                          ArrayRef<int64_t> fastestVaryingPattern) {
  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
  NestedPatternContext mlContext;
  vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern);
}

/// External utility to vectorize affine loops from a single loop nest using an
/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
/// Loops are provided in a 2D vector container. The first dimension represents
/// the nesting level relative to the loops to be vectorized. The second
/// dimension contains the loops. This means that:
/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
///
/// For example, for the following loop nest:
///
///   func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>,
///               %out0: memref<64x128x512xf32>,
///               %out1: memref<64x128x128xf32>) {
///     affine.for %i0 = 0 to 64 {
///       affine.for %i1 = 0 to 128 {
///         affine.for %i2 = 0 to 512 {
///           %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32>
///           affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32>
///         }
///         affine.for %i3 = 0 to 128 {
///           %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32>
///           affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32>
///         }
///       }
///     }
///     return
///   }
///
/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two
///   innermost loops;
/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost
///   loops;
/// loops = {{%i2}}, to vectorize only the first innermost loop;
/// loops = {{%i3}}, to vectorize only the second innermost loop;
/// loops = {{%i1}}, to vectorize only the middle loop.
LogicalResult
vectorizeAffineLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
                        const VectorizationStrategy &strategy) {
  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
  NestedPatternContext mlContext;
  verifyLoopNesting(loops);
  return vectorizeLoopNest(loops, strategy);
}

std::unique_ptr<OperationPass<FuncOp>>
createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize) {
  return std::make_unique<Vectorize>(virtualVectorSize);
}
std::unique_ptr<OperationPass<FuncOp>> createSuperVectorizePass() {
  return std::make_unique<Vectorize>();
}

} // namespace mlir
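
// Example of driving the external utilities above from a custom pass (an
// illustrative sketch mirroring Vectorize::runOnFunction; `funcOp` and the
// chosen vector sizes are hypothetical, not part of this file):
//
//   DenseSet<Operation *> parallelLoops;
//   funcOp.walk([&parallelLoops](AffineForOp forOp) {
//     if (isLoopParallel(forOp))
//       parallelLoops.insert(forOp);
//   });
//   SmallVector<int64_t, 2> vectorSizes = {4, 128}; // hypothetical 2-D sizes
//   vectorizeAffineLoops(funcOp, parallelLoops, vectorSizes,
//                        /*fastestVaryingPattern=*/{});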