//===- Sparsification.cpp - Implementation of sparsification --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements converting sparse tensor types to actual sparse code.
//
//===----------------------------------------------------------------------===//

#include "CodegenUtils.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
#include "mlir/Dialect/SparseTensor/Utils/Merger.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/TensorEncoding.h"
#include "llvm/ADT/SmallBitVector.h"

using namespace mlir;
using namespace mlir::sparse_tensor;

//===----------------------------------------------------------------------===//
// Declarations of data structures.
//===----------------------------------------------------------------------===//

namespace {

// Iteration graph sorting.
enum SortMask {
  kSparseOnly = 0x0,
  kIncludeDense = 0x1,
  kIncludeUndef = 0x2,
  kIncludeAll = 0x3
};

// Reduction kinds.
enum Reduction { kNoReduc, kSum, kProduct, kAnd, kOr, kXor };

// Code generation.
struct CodeGen {
  CodeGen(SparsificationOptions o, unsigned numTensors, unsigned numLoops,
          OpOperand *op, unsigned nest)
      : options(o), loops(numLoops), sizes(numLoops), buffers(numTensors),
        pointers(numTensors, std::vector<Value>(numLoops)),
        indices(numTensors, std::vector<Value>(numLoops)),
        highs(numTensors, std::vector<Value>(numLoops)),
        pidxs(numTensors, std::vector<Value>(numLoops)),
        idxs(numTensors, std::vector<Value>(numLoops)), redVal(), sparseOut(op),
        outerParNest(nest), lexIdx(), lexVal(), expValues(), expFilled(),
        expAdded(), expCount(), curVecMask() {}
  /// Sparsification options.
  SparsificationOptions options;
  /// Universal dense indices and upper bounds (by index). The loops array
  /// is updated with the value of the universal dense index in the current
  /// loop. The sizes array is set once with the inferred dimension sizes.
  std::vector<Value> loops;
  std::vector<Value> sizes;
  /// Buffers for storing dense and sparse numerical values (by tensor).
  /// This array is set once during bufferization of all tensors.
  std::vector<Value> buffers;
  /// Sparse storage schemes (1-D): pointers and indices (by tensor and index).
  /// This array is set once during bufferization of all sparse tensors.
  std::vector<std::vector<Value>> pointers;
  std::vector<std::vector<Value>> indices;
  /// Sparse iteration information (by tensor and index). These arrays
  /// are updated to remain current within the current loop.
  std::vector<std::vector<Value>> highs;
  std::vector<std::vector<Value>> pidxs;
  std::vector<std::vector<Value>> idxs;
  /// Current reduction, updated during code generation. When indices of a
  /// reduction are exhausted, all inner loops can use a scalarized reduction.
  unsigned redExp = -1u;
  Value redVal;
  Reduction redKind = kNoReduc;
  // Sparse tensor as output. Implemented either through direct injective
  // insertion in lexicographic index order (where indices are updated
  // in the temporary array `lexIdx`) or through access pattern expansion
  // in the innermost loop nest (`expValues` through `expCount`).
  OpOperand *sparseOut;
  unsigned outerParNest;
  Value lexIdx;
  Value lexVal;
  Value expValues;
  Value expFilled;
  Value expAdded;
  Value expCount;
  // Current vector length and mask.
  unsigned curVecLength = 1;
  Value curVecMask;
};

} // namespace

//===----------------------------------------------------------------------===//
// Sparse compiler analysis methods.
//===----------------------------------------------------------------------===//

/// Helper method to construct a permuted dimension ordering
/// that adheres to the given topological sort.
static AffineMap permute(MLIRContext *context, AffineMap m,
                         std::vector<unsigned> &topSort) {
  unsigned sz = topSort.size();
  assert(m.getNumResults() == sz && "TopoSort/AffineMap size mismatch");
  // Construct the inverse of `m`, to avoid the asymptotic complexity
  // of calling `m.getPermutedPosition` repeatedly.
  SmallVector<unsigned, 4> inv(sz);
  for (unsigned i = 0; i < sz; i++)
    inv[i] = m.getDimPosition(i);
  // Construct the permutation.
  SmallVector<unsigned, 4> perm(sz);
  for (unsigned i = 0; i < sz; i++)
    perm[i] = inv[topSort[i]];
  return AffineMap::getPermutationMap(perm, context);
}

/// Helper method to apply dimension ordering permutation.
static unsigned perm(const SparseTensorEncodingAttr &enc, unsigned d) {
  if (enc) {
    auto order = enc.getDimOrdering();
    if (order) {
      assert(order.isPermutation());
      return order.getDimPosition(d);
    }
  }
  return d;
}

/// Helper method to translate dim level type to internal representation.
static Dim toDim(const SparseTensorEncodingAttr &enc, unsigned d) {
  if (enc) {
    SparseTensorEncodingAttr::DimLevelType tp = enc.getDimLevelType()[d];
    if (tp == SparseTensorEncodingAttr::DimLevelType::Compressed)
      return Dim::kSparse;
    if (tp == SparseTensorEncodingAttr::DimLevelType::Singleton)
      return Dim::kSingle;
  }
  return Dim::kDense;
}

/// Helper method to inspect affine expressions. Rejects cases where the
/// same index is used more than once. Also rejects affine expressions
/// that are not a direct index for annotated tensors.
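/// For example, a subscript a(i) on an annotated tensor is accepted, while a
/// compound subscript such as a(i + j) or a constant subscript is only
/// accepted when the tensor is dense (non-annotated).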
// TODO: accept more affine cases for sparse tensors
static bool findAffine(Merger &merger, unsigned tensor, AffineExpr a, Dim dim,
                       bool isDense) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    unsigned idx = a.cast<AffineDimExpr>().getPosition();
    if (!merger.isDim(tensor, idx, Dim::kUndef))
      return false; // used more than once
    merger.setDim(tensor, idx, dim);
    return true;
  }
  case AffineExprKind::Add:
  case AffineExprKind::Mul: {
    if (!isDense)
      return false;
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return findAffine(merger, tensor, binOp.getLHS(), dim, isDense) &&
           findAffine(merger, tensor, binOp.getRHS(), dim, isDense);
  }
  case AffineExprKind::Constant:
    return isDense;
  default:
    return false;
  }
}

/// Helper method to inspect sparse encodings in the tensor types.
/// Fills the per-dimension sparsity information for all tensors.
/// Returns true if the sparse annotations and affine subscript
/// expressions of all tensors are admissible. Returns false if
/// no annotations are found or inadmissible constructs occur.
static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) {
  bool annotated = false;
  for (OpOperand *t : op.getInputAndOutputOperands()) {
    auto map = op.getTiedIndexingMap(t);
    auto enc = getSparseTensorEncoding(t->get().getType());
    if (enc)
      annotated = true;
    assert(map.getNumResults() == op.getRank(t));
    for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
      unsigned tensor = t->getOperandNumber();
      AffineExpr a = map.getResult(perm(enc, d));
      if (!findAffine(merger, tensor, a, toDim(enc, d), !enc))
        return false; // inadmissible affine expression
    }
  }
  return annotated;
}

/// A DFS helper to compute a topological sort. Note that recursion is
/// bounded by the number of implicit loops, which is always small.
/// Returns false when a cycle is detected.
static bool topSortDFS(unsigned i, std::vector<unsigned> &visit,
                       std::vector<unsigned> &topSort,
                       std::vector<std::vector<bool>> &adjM) {
  if (visit[i] != 0)
    return visit[i] != 1; // 1 denotes cycle!
  visit[i] = 1;
  for (unsigned j = 0, e = visit.size(); j < e; j++)
    if (adjM[i][j])
      if (!topSortDFS(j, visit, topSort, adjM))
        return false;
  visit[i] = 2;
  topSort.push_back(i);
  return true;
}

/// Helper method to add all constraints from the indices in one affine
/// expression before all indices in the other affine expression. For
/// example i0+i1 < i2+i3+1 yields i0<i2, i0<i3, i1<i2, and i1<i3.
static void addAffineOrderings(std::vector<std::vector<bool>> &adjM,
                               AffineExpr a, AffineExpr b, unsigned fidx) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    unsigned idx = a.cast<AffineDimExpr>().getPosition();
    if (b)
      addAffineOrderings(adjM, b, AffineExpr(), idx);
    else
      adjM[fidx][idx] = true;
    break;
  }
  case AffineExprKind::Add:
  case AffineExprKind::Mul: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    addAffineOrderings(adjM, binOp.getLHS(), b, fidx);
    addAffineOrderings(adjM, binOp.getRHS(), b, fidx);
    break;
  }
  default:
    break;
  }
}

/// Computes a topologically sorted iteration graph for the linalg operation.
/// Ensures all tensors are visited in natural index order. This is essential
/// for sparse storage formats since these only support access along fixed
/// dimensions. Even for dense storage formats, however, the natural index
/// order yields innermost unit-stride access with better spatial locality.
static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
                                  std::vector<unsigned> &topSort, unsigned mask,
                                  OpOperand *skip = nullptr) {
  // Set up an n x n from/to adjacency matrix of the iteration graph
  // for the implicit loop indices i_0 .. i_n-1.
  unsigned n = op.getNumLoops();
  std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));

  // Iterate over the indexing maps of every tensor in the tensor expression.
  for (OpOperand *t : op.getInputAndOutputOperands()) {
    // Skip tensor during cycle resolution.
    if (t == skip)
      continue;
    // Get map and encoding.
    auto map = op.getTiedIndexingMap(t);
    auto enc = getSparseTensorEncoding(t->get().getType());
    assert(map.getNumDims() == n);
    // Skip dense tensor constraints when not requested.
    if (!(mask & SortMask::kIncludeDense) && !enc)
      continue;
    // Each tensor expression and optional dimension ordering (row-major
    // by default) puts an ordering constraint on the loop indices. For
    // example, the tensor expression A_ijk forces the ordering i < j < k
    // on the loop indices if no explicit dimension ordering is given.
    for (unsigned d = 1, rank = map.getNumResults(); d < rank; d++) {
      AffineExpr f = map.getResult(perm(enc, d - 1));
      AffineExpr t = map.getResult(perm(enc, d));
      addAffineOrderings(adjM, f, t, 0);
    }
    // Push unrelated loops into sparse iteration space, so these
    // will be skipped more often.
    if (mask & SortMask::kIncludeUndef) {
      unsigned tensor = t->getOperandNumber();
      for (unsigned i = 0; i < n; i++)
        if (merger.isDim(tensor, i, Dim::kSparse))
          for (unsigned j = 0; j < n; j++)
            if (merger.isDim(tensor, j, Dim::kUndef))
              adjM[i][j] = true;
    }
  }

  // Topologically sort the iteration graph to determine loop order.
  // Report failure for a cyclic iteration graph.
  topSort.clear();
  topSort.reserve(n);
  std::vector<unsigned> visit(n, 0);
  for (unsigned i = 0; i < n; i++)
    if (visit[i] == 0)
      if (!topSortDFS(i, visit, topSort, adjM))
        return false; // cycle!
  std::reverse(std::begin(topSort), std::end(topSort));
  return true;
}

/// Returns true if tensor has an in-place annotation.
static bool isInPlace(Value val) {
  if (auto arg = val.dyn_cast<BlockArgument>())
    if (auto funcOp = dyn_cast<func::FuncOp>(arg.getOwner()->getParentOp()))
      if (auto attr = funcOp.getArgAttrOfType<BoolAttr>(
              arg.getArgNumber(),
              bufferization::BufferizableOpInterface::kInplaceableAttrName))
        return attr.getValue();
  return false;
}

/// Returns true if tensor materializes uninitialized into the computation.
static bool isMaterializing(Value val) {
  return val.getDefiningOp<linalg::InitTensorOp>() ||
         val.getDefiningOp<bufferization::AllocTensorOp>();
}

/// Returns true when the tensor expression is admissible for codegen.
/// Since all sparse input tensors are admissible, we just need to check
/// whether the out tensor in the tensor expression codegen is admissible.
/// Sets `sparseOut` to the tensor and `outerParNest` to the outer injective
/// nesting depth when a "truly dynamic" sparse tensor output occurs.
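/// For example, an output that only updates its own stored values in place
/// (a "simply dynamic" kernel such as scaling every nonzero) is admissible,
/// whereas an output that gains new nonzeros is only admissible through the
/// materializing/expansion path handled below.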
static bool isAdmissableTensorExp(Merger &merger, linalg::GenericOp op,
                                  std::vector<unsigned> &topSort, unsigned exp,
                                  OpOperand **sparseOut,
                                  unsigned &outerParNest) {
  OpOperand *lhs = op.getOutputOperand(0);
  unsigned tensor = lhs->getOperandNumber();
  auto enc = getSparseTensorEncoding(lhs->get().getType());
  // A non-annotated output tensor is assumed dense, and becomes a random
  // access n-dim memref. Admissible since insertions cannot occur.
  if (!enc)
    return true;
  // An all-dense annotated "sparse" output tensor becomes a linearized random
  // access 1-dim memref. Also admissible since insertions cannot occur.
  bool allDense = true;
  auto iteratorTypes = op.iterator_types().getValue();
  unsigned numLoops = iteratorTypes.size();
  for (unsigned i = 0; i < numLoops; i++)
    if (merger.isDim(tensor, i, Dim::kSparse)) {
      allDense = false;
      break;
    }
  if (allDense)
    return true;
  // A tensor expression with a sparse output tensor that changes its values
  // but not its nonzero structure, an operation called "simply dynamic" in
  // [Bik96,Ch9], is also admissible without special codegen, provided
  // the tensor's underlying sparse storage scheme can be modified in-place.
  if (merger.isSingleCondition(tensor, exp) && isInPlace(lhs->get()))
    return true;
  // Accept "truly dynamic" if the output tensor materializes uninitialized
  // into the computation and insertions occur in lexicographic index order.
  if (isMaterializing(lhs->get())) {
    unsigned nest = 0;
    for (unsigned i = 0; i < numLoops; i++) {
      if (isReductionIterator(iteratorTypes[topSort[i]]))
        break; // terminate at first reduction
      nest++;
    }
    // Determine admissible dynamic insertion situations:
    // (1) fully injective, since there are no reductions,
    // (2) admissible 1-d expansion in innermost dimension.
    if (nest >= op.getRank(lhs) - 1) {
      *sparseOut = lhs;
      outerParNest = nest;
      return true;
    }
  }
  return false;
}

//===----------------------------------------------------------------------===//
// Sparse compiler synthesis methods (reductions).
//===----------------------------------------------------------------------===//

/// Maps reduction kind to vector::CombiningKind.
static vector::CombiningKind getCombiningKind(Reduction kind) {
  switch (kind) {
  case kNoReduc:
    break;
  case kSum:
    return vector::CombiningKind::ADD;
  case kProduct:
    return vector::CombiningKind::MUL;
  case kAnd:
    return vector::CombiningKind::AND;
  case kOr:
    return vector::CombiningKind::OR;
  case kXor:
    return vector::CombiningKind::XOR;
  }
  llvm_unreachable("unknown reduction kind");
}

/// Maps operation to reduction.
static Reduction getReduction(Kind kind) {
  switch (kind) {
  case Kind::kAddF:
  case Kind::kAddC:
  case Kind::kAddI:
  case Kind::kSubF:
  case Kind::kSubC:
  case Kind::kSubI:
    return kSum;
  case Kind::kMulF:
  case Kind::kMulC:
  case Kind::kMulI:
    return kProduct;
  case Kind::kAndI:
    return kAnd;
  case Kind::kOrI:
    return kOr;
  case Kind::kXorI:
    return kXor;
  default:
    llvm_unreachable("unexpected reduction operator");
  }
}

/// Generates an initial value for a vector reduction, following the scheme
/// given in Chapter 5 of "The Software Vectorization Handbook", where the
/// initial scalar value is correctly embedded in the vector reduction value,
/// and a straightforward horizontal reduction will complete the operation.
static Value genVectorReducInit(CodeGen &codegen, OpBuilder &builder,
                                Location loc, VectorType vtp) {
  Value r = codegen.redVal;
  switch (codegen.redKind) {
  case kNoReduc:
    break;
  case kSum:
  case kXor:
    // Initialize reduction vector to: | 0 | .. | 0 | r |
    return builder.create<vector::InsertElementOp>(
        loc, r, constantZero(builder, loc, vtp),
        constantIndex(builder, loc, 0));
  case kProduct:
    // Initialize reduction vector to: | 1 | .. | 1 | r |
    return builder.create<vector::InsertElementOp>(
        loc, r, constantOne(builder, loc, vtp), constantIndex(builder, loc, 0));
  case kAnd:
  case kOr:
    // Initialize reduction vector to: | r | .. | r | r |
    return builder.create<vector::BroadcastOp>(loc, vtp, r);
  }
  llvm_unreachable("unknown reduction kind");
}

/// Generates final value for a vector reduction.
static Value genVectorReducEnd(CodeGen &codegen, OpBuilder &builder,
                               Location loc, VectorType vtp) {
  vector::CombiningKind kind = getCombiningKind(codegen.redKind);
  return builder.create<vector::ReductionOp>(loc, kind, codegen.redVal);
}

/// Updates scalarized reduction value.
static void updateReduc(Merger &merger, CodeGen &codegen, Value reduc) {
  assert(codegen.redKind != kNoReduc);
  codegen.redVal = merger.exp(codegen.redExp).val = reduc;
}

//===----------------------------------------------------------------------===//
// Sparse compiler synthesis methods (statements and expressions).
//===----------------------------------------------------------------------===//

/// Generates buffer for the output tensor. Note that all sparse kernels
/// assume that when all elements are written to (viz. x(i) = y(i) * z(i)),
/// the output buffer is already initialized to all zeroes and only nonzero
/// values are computed and written out. For updates (viz. x(i) += y(i) * z(i)),
/// only nonzero values are used for the updates and no assumption on the
/// original contents of the output buffer is necessary.
static Value genOutputBuffer(CodeGen &codegen, OpBuilder &builder,
                             linalg::GenericOp op, MemRefType denseTp,
                             ArrayRef<Value> args) {
  Location loc = op.getLoc();
  OpOperand *lhs = op.getOutputOperand(0);
  Value tensor = lhs->get();
  bool isInit = op.isInitTensor(lhs);
  // An output tensor that is in-place can simply materialize from the buffer
  // of the tensor that appears in the outs() clause. For updates, this has
  // the advantage that only the nonzero values are involved in the computation,
  // keeping the operation O(nnz). In all other cases, we are forced to zero
  // out the buffer to enforce the assumption above, which may negatively
  // impact running complexity (viz. O(n^2 + nnz) vs. O(nnz) for matrices).
  // TODO: use better analysis to avoid zeroing out the buffer?
  if (isInPlace(tensor)) {
    Value init =
        builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
    if (!isInit) {
      Value zero = constantZero(builder, loc, denseTp.getElementType());
      builder.create<linalg::FillOp>(loc, ValueRange{zero}, ValueRange{init});
    }
    return init;
  }
  // By default, a new buffer is allocated which is either set to zero (when
  // no updates occur or the tensor materializes into this computation) or
  // initialized to the value of the tensor defined in the outs() clause.
  // This is always correct (since it enforces all assumptions above) but
  // may negatively impact running complexity as explained above.
  Value alloc = builder.create<memref::AllocOp>(loc, denseTp, args);
  if (!isInit || isMaterializing(tensor)) {
    Value zero = constantZero(builder, loc, denseTp.getElementType());
    builder.create<linalg::FillOp>(loc, ValueRange{zero}, ValueRange{alloc});
  } else {
    Value init =
        builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
    builder.create<memref::CopyOp>(loc, init, alloc);
  }
  return alloc;
}

/// Local bufferization of all dense and sparse data structures.
/// This code enables testing the first prototype sparse compiler.
// TODO: replace this with a proliferated bufferization strategy
static void genBuffers(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                       linalg::GenericOp op) {
  Location loc = op.getLoc();
  assert(op.getNumInputsAndOutputs() == op.getNumInputs() + 1);
  // For every tensor, find lower and upper bound on dimensions, set the
  // same bounds on loop indices, and obtain dense or sparse buffer(s).
  SmallVector<Value, 4> args;
  for (OpOperand *t : op.getInputAndOutputOperands()) {
    unsigned tensor = t->getOperandNumber();
    auto shape = op.getShape(t);
    auto map = op.getTiedIndexingMap(t);
    auto enc = getSparseTensorEncoding(t->get().getType());
    // Scan all dimensions of current tensor.
    args.clear();
    for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
      AffineExpr a = map.getResult(perm(enc, d));
      if (a.getKind() != AffineExprKind::DimId)
        continue; // compound
      unsigned idx = a.cast<AffineDimExpr>().getPosition();
      // Handle sparse storage schemes.
      if (merger.isDim(tensor, idx, Dim::kSparse)) {
        auto dynShape = {ShapedType::kDynamicSize};
        auto ptrTp =
            MemRefType::get(dynShape, getPointerOverheadType(builder, enc));
        auto indTp =
            MemRefType::get(dynShape, getIndexOverheadType(builder, enc));
        Value dim = constantIndex(builder, loc, d);
        // Generate sparse primitives to obtain pointers and indices.
        codegen.pointers[tensor][idx] =
            builder.create<ToPointersOp>(loc, ptrTp, t->get(), dim);
        codegen.indices[tensor][idx] =
            builder.create<ToIndicesOp>(loc, indTp, t->get(), dim);
      }
      // Find upper bound in current dimension.
      unsigned p = perm(enc, d);
      Value up = linalg::createOrFoldDimOp(builder, loc, t->get(), p);
      if (ShapedType::isDynamic(shape[p]))
        args.push_back(up);
      assert(codegen.highs[tensor][idx] == nullptr);
      codegen.sizes[idx] = codegen.highs[tensor][idx] = up;
    }
    // Perform the required bufferization. Dense inputs materialize
    // from the input tensors. Dense outputs need special handling.
    // Sparse inputs use sparse primitives to obtain the values.
    // We also accept in-place all-dense annotated "sparse" outputs.
    Type elementType = getElementTypeOrSelf(t->get().getType());
    if (!enc) {
      // Non-annotated dense tensors.
      auto denseTp = MemRefType::get(shape, elementType);
      if (tensor < op.getNumInputs())
        codegen.buffers[tensor] =
            builder.create<bufferization::ToMemrefOp>(loc, denseTp, t->get());
      else
        codegen.buffers[tensor] =
            genOutputBuffer(codegen, builder, op, denseTp, args);
    } else if (t == codegen.sparseOut) {
      // True sparse output needs a lexIdx array.
      Value rank = constantIndex(builder, loc, op.getRank(t));
      auto dynShape = {ShapedType::kDynamicSize};
      auto memTp = MemRefType::get(dynShape, builder.getIndexType());
      codegen.lexIdx = builder.create<memref::AllocaOp>(loc, memTp, rank);
      codegen.lexVal = builder.create<memref::AllocaOp>(
          loc, MemRefType::get({}, elementType));
    } else {
      // Annotated sparse tensors.
      auto dynShape = {ShapedType::kDynamicSize};
      auto sparseTp = MemRefType::get(dynShape, elementType);
      codegen.buffers[tensor] =
          builder.create<ToValuesOp>(loc, sparseTp, t->get());
    }
  }
}

/// Constructs vector type.
static VectorType vectorType(CodeGen &codegen, Type etp) {
  unsigned numScalableDims = codegen.options.enableVLAVectorization;
  return VectorType::get(codegen.curVecLength, etp, numScalableDims);
}

/// Constructs vector type from pointer.
static VectorType vectorType(CodeGen &codegen, Value ptr) {
  return vectorType(codegen, ptr.getType().cast<MemRefType>().getElementType());
}

/// Constructs vector iteration mask.
static Value genVectorMask(CodeGen &codegen, OpBuilder &builder, Value iv,
                           Value lo, Value hi, Value step) {
  Location loc = iv.getLoc();
  VectorType mtp = vectorType(codegen, builder.getI1Type());
  // Special case if the vector length evenly divides the trip count (for
  // example, "for i = 0, 128, 16"). A constant all-true mask is generated
  // so that all subsequent masked memory operations are immediately folded
  // into unconditional memory operations.
  IntegerAttr loInt, hiInt, stepInt;
  if (matchPattern(lo, m_Constant(&loInt)) &&
      matchPattern(hi, m_Constant(&hiInt)) &&
      matchPattern(step, m_Constant(&stepInt))) {
    if (((hiInt.getInt() - loInt.getInt()) % stepInt.getInt()) == 0)
      return builder.create<vector::BroadcastOp>(
          loc, mtp, constantI1(builder, loc, true));
  }
  // Otherwise, generate a vector mask that avoids overrunning the upper bound
  // during vector execution. Here we rely on subsequent loop optimizations to
  // avoid executing the mask in all iterations, for example, by splitting the
  // loop into an unconditional vector loop and a scalar cleanup loop.
  auto minMap = AffineMap::get(
      /*dimCount=*/2, /*symbolCount=*/1,
      {builder.getAffineSymbolExpr(0),
       builder.getAffineDimExpr(0) - builder.getAffineDimExpr(1)},
      builder.getContext());
  Value end =
      builder.createOrFold<AffineMinOp>(loc, minMap, ValueRange{hi, iv, step});
  return builder.create<vector::CreateMaskOp>(loc, mtp, end);
}

/// Generates a vectorized load lhs = a[ind[lo:hi]] or lhs = a[lo:hi].
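/// A gather is emitted when the innermost subscript is itself a vector of
/// indices (the indirect a[ind[lo:hi]] form); otherwise a masked load of the
/// contiguous range a[lo:hi] suffices.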
static Value genVectorLoad(CodeGen &codegen, OpBuilder &builder, Value ptr,
                           ArrayRef<Value> args) {
  Location loc = ptr.getLoc();
  VectorType vtp = vectorType(codegen, ptr);
  Value pass = constantZero(builder, loc, vtp);
  if (args.back().getType().isa<VectorType>()) {
    SmallVector<Value, 4> scalarArgs(args.begin(), args.end());
    Value indexVec = args.back();
    scalarArgs.back() = constantIndex(builder, loc, 0);
    return builder.create<vector::GatherOp>(loc, vtp, ptr, scalarArgs, indexVec,
                                            codegen.curVecMask, pass);
  }
  return builder.create<vector::MaskedLoadOp>(loc, vtp, ptr, args,
                                              codegen.curVecMask, pass);
}

/// Generates a vectorized store a[ind[lo:hi]] = rhs or a[lo:hi] = rhs.
static void genVectorStore(CodeGen &codegen, OpBuilder &builder, Value rhs,
                           Value ptr, ArrayRef<Value> args) {
  Location loc = ptr.getLoc();
  if (args.back().getType().isa<VectorType>()) {
    SmallVector<Value, 4> scalarArgs(args.begin(), args.end());
    Value indexVec = args.back();
    scalarArgs.back() = constantIndex(builder, loc, 0);
    builder.create<vector::ScatterOp>(loc, ptr, scalarArgs, indexVec,
                                      codegen.curVecMask, rhs);
    return;
  }
  builder.create<vector::MaskedStoreOp>(loc, ptr, args, codegen.curVecMask,
                                        rhs);
}

/// Generates a vectorized invariant. Here we rely on subsequent loop
/// optimizations to hoist the invariant broadcast out of the vector loop.
static Value genVectorInvariantValue(CodeGen &codegen, OpBuilder &builder,
                                     Value val) {
  VectorType vtp = vectorType(codegen, val.getType());
  return builder.create<vector::BroadcastOp>(val.getLoc(), vtp, val);
}

/// Generates an affine expression.
//
// TODO: generalize for sparse tensor subscripts
//
static Value genAffine(CodeGen &codegen, OpBuilder &builder, AffineExpr a,
                       Location loc) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    unsigned idx = a.cast<AffineDimExpr>().getPosition();
    return codegen.loops[idx]; // universal dense index
  }
  case AffineExprKind::Add: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return builder.create<arith::AddIOp>(
        loc, genAffine(codegen, builder, binOp.getLHS(), loc),
        genAffine(codegen, builder, binOp.getRHS(), loc));
  }
  case AffineExprKind::Mul: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return builder.create<arith::MulIOp>(
        loc, genAffine(codegen, builder, binOp.getLHS(), loc),
        genAffine(codegen, builder, binOp.getRHS(), loc));
  }
  case AffineExprKind::Constant: {
    int64_t c = a.cast<AffineConstantExpr>().getValue();
    return constantIndex(builder, loc, c);
  }
  default:
    llvm_unreachable("unexpected affine subscript");
  }
}

/// Generates index for load/store on sparse tensor.
static Value genIndex(CodeGen &codegen, linalg::GenericOp op, OpOperand *t) {
  auto map = op.getTiedIndexingMap(t);
  auto enc = getSparseTensorEncoding(t->get().getType());
  AffineExpr a = map.getResult(perm(enc, map.getNumResults() - 1));
  assert(a.getKind() == AffineExprKind::DimId);
  unsigned idx = a.cast<AffineDimExpr>().getPosition();
  return codegen.loops[idx];
}

/// Generates subscript for load/store on a dense or sparse tensor.
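/// For an annotated tensor, the position index (pidx) of the innermost stored
/// dimension subscripts the linearized values array; for a dense tensor, one
/// affine subscript per dimension is generated instead.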
static Value genSubscript(CodeGen &codegen, OpBuilder &builder,
                          linalg::GenericOp op, OpOperand *t,
                          SmallVector<Value, 4> &args) {
  unsigned tensor = t->getOperandNumber();
  auto map = op.getTiedIndexingMap(t);
  auto enc = getSparseTensorEncoding(t->get().getType());
  unsigned rank = map.getNumResults();
  if (enc) {
    // Note that currently, all sparse subscripts are simple.
    // TODO: accept affine too?
    AffineExpr a = map.getResult(perm(enc, rank - 1));
    assert(a.getKind() == AffineExprKind::DimId);
    unsigned idx = a.cast<AffineDimExpr>().getPosition();
    assert(codegen.pidxs[tensor][idx] != nullptr);
    args.push_back(codegen.pidxs[tensor][idx]); // position index
  } else {
    for (unsigned d = 0; d < rank; d++) {
      AffineExpr a = map.getResult(perm(enc, d));
      args.push_back(genAffine(codegen, builder, a, op.getLoc()));
    }
  }
  return codegen.buffers[tensor];
}

/// Generates insertion code to implement dynamic tensor load.
static Value genInsertionLoad(CodeGen &codegen, OpBuilder &builder,
                              linalg::GenericOp op, OpOperand *t) {
  Location loc = op.getLoc();
  // Direct lexicographic index order, tensor loads as zero.
  if (!codegen.expValues) {
    Type tp = getElementTypeOrSelf(t->get().getType());
    return constantZero(builder, loc, tp);
  }
  // Load from expanded access pattern.
  Value index = genIndex(codegen, op, t);
  return builder.create<memref::LoadOp>(loc, codegen.expValues, index);
}

/// Generates insertion code to implement dynamic tensor store.
static void genInsertionStore(CodeGen &codegen, OpBuilder &builder,
                              linalg::GenericOp op, OpOperand *t, Value rhs) {
  Location loc = op.getLoc();
  // Direct insertion in lexicographic index order.
  if (!codegen.expValues) {
    builder.create<memref::StoreOp>(loc, rhs, codegen.lexVal);
    builder.create<LexInsertOp>(loc, t->get(), codegen.lexIdx, codegen.lexVal);
    return;
  }
  // Generates insertion code along expanded access pattern.
  //   if (!expFilled[i]) then
  //     expFilled[i] = true
  //     expAdded[inserts++] = i
  //   endif
  //   values[i] = rhs
  Value index = genIndex(codegen, op, t);
  Value fval = constantI1(builder, loc, false);
  Value tval = constantI1(builder, loc, true);
  // If statement.
  Value filled = builder.create<memref::LoadOp>(loc, codegen.expFilled, index);
  Value cond = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                             filled, fval);
  scf::IfOp ifOp = builder.create<scf::IfOp>(loc, builder.getIndexType(), cond,
                                             /*else=*/true);
  // True branch.
  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
  builder.create<memref::StoreOp>(loc, tval, codegen.expFilled, index);
  builder.create<memref::StoreOp>(loc, index, codegen.expAdded,
                                  codegen.expCount);
  Value one = constantIndex(builder, loc, 1);
  Value add = builder.create<arith::AddIOp>(loc, codegen.expCount, one);
  builder.create<scf::YieldOp>(loc, add);
  // False branch.
  builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
  builder.create<scf::YieldOp>(loc, codegen.expCount);
  builder.setInsertionPointAfter(ifOp);
  // Value assignment.
  codegen.expCount = ifOp.getResult(0);
  builder.create<memref::StoreOp>(loc, rhs, codegen.expValues, index);
}

/// Generates a load on a dense or sparse tensor.
static Value genTensorLoad(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                           linalg::GenericOp op, unsigned exp) {
  // Test if the load was hoisted to a higher loop nest.
  Value val = merger.exp(exp).val;
  if (val) {
    if (codegen.curVecLength > 1 && !val.getType().isa<VectorType>())
      return genVectorInvariantValue(codegen, builder, val);
    return val;
  }
  // Load during insertion.
  OpOperand *t = op.getInputAndOutputOperands()[merger.exp(exp).tensor];
  if (t == codegen.sparseOut)
    return genInsertionLoad(codegen, builder, op, t);
  // Actual load.
  SmallVector<Value, 4> args;
  Value ptr = genSubscript(codegen, builder, op, t, args);
  if (codegen.curVecLength > 1)
    return genVectorLoad(codegen, builder, ptr, args);
  return builder.create<memref::LoadOp>(op.getLoc(), ptr, args);
}

/// Generates a store on a dense or sparse tensor.
static void genTensorStore(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                           linalg::GenericOp op, unsigned exp, Value rhs) {
  Location loc = op.getLoc();
  // Test if this is a scalarized reduction.
  if (codegen.redVal) {
    if (codegen.curVecLength > 1)
      rhs = builder.create<arith::SelectOp>(loc, codegen.curVecMask, rhs,
                                            codegen.redVal);
    updateReduc(merger, codegen, rhs);
    return;
  }
  // Store during insertion.
  OpOperand *t = op.getOutputOperand(0);
  if (t == codegen.sparseOut) {
    if (!rhs) {
      // Only unary and binary are allowed to return uninitialized rhs
      // to indicate missing output.
      assert(merger.exp(exp).kind == kUnary || merger.exp(exp).kind == kBinary);
    } else {
      genInsertionStore(codegen, builder, op, t, rhs);
    }
    return;
  }
  // Actual store.
  SmallVector<Value, 4> args;
  Value ptr = genSubscript(codegen, builder, op, t, args);
  if (codegen.curVecLength > 1)
    genVectorStore(codegen, builder, rhs, ptr, args);
  else
    builder.create<memref::StoreOp>(loc, rhs, ptr, args);
}

/// Generates a pointer/index load from the sparse storage scheme. Narrower
/// data types need to be zero extended before casting the value into the
/// index type used for looping and indexing.
static Value genLoad(CodeGen &codegen, OpBuilder &builder, Location loc,
                     Value ptr, Value s) {
  // See https://llvm.org/docs/GetElementPtr.html for some background on
  // the complications described below.
  if (codegen.curVecLength > 1) {
    // Since the index vector is used in subsequent gather/scatter operations,
    // which effectively defines an unsigned pointer + signed index, we must
    // zero extend the vector to an index width. For 8-bit and 16-bit values,
    // a 32-bit index width suffices. For 32-bit values, zero extending the
    // elements into 64-bit loses some performance since the 32-bit indexed
    // gather/scatter is more efficient than the 64-bit index variant (if the
    // negative 32-bit index space is unused, the enableSIMDIndex32 flag can
    // preserve this performance). For 64-bit values, there is no good way
    // to state that the indices are unsigned, which creates the potential for
    // incorrect address calculations in the unlikely case we need such
    // extremely large offsets.
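    // In short: i8/i16 indices are zero extended to i32; i32 indices are
    // zero extended to i64 unless enableSIMDIndex32 is set; i64 and index
    // values are used as is.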
    Type etp = ptr.getType().cast<MemRefType>().getElementType();
    Value vload = genVectorLoad(codegen, builder, ptr, {s});
    if (!etp.isa<IndexType>()) {
      if (etp.getIntOrFloatBitWidth() < 32)
        vload = builder.create<arith::ExtUIOp>(
            loc, vectorType(codegen, builder.getI32Type()), vload);
      else if (etp.getIntOrFloatBitWidth() < 64 &&
               !codegen.options.enableSIMDIndex32)
        vload = builder.create<arith::ExtUIOp>(
            loc, vectorType(codegen, builder.getI64Type()), vload);
    }
    return vload;
  }
  // For the scalar case, we simply zero extend narrower indices into 64-bit
  // values before casting to index without a performance penalty. Here too,
  // however, indices that already are 64-bit, in theory, cannot express the
  // full range as explained above.
  Value load = builder.create<memref::LoadOp>(loc, ptr, s);
  if (!load.getType().isa<IndexType>()) {
    if (load.getType().getIntOrFloatBitWidth() < 64)
      load = builder.create<arith::ExtUIOp>(loc, builder.getI64Type(), load);
    load =
        builder.create<arith::IndexCastOp>(loc, builder.getIndexType(), load);
  }
  return load;
}

/// Generates an invariant value.
static Value genInvariantValue(Merger &merger, CodeGen &codegen,
                               OpBuilder &builder, unsigned exp) {
  Value val = merger.exp(exp).val;
  if (codegen.curVecLength > 1)
    return genVectorInvariantValue(codegen, builder, val);
  return val;
}

/// Generates an address computation "sz * p + i".
static Value genAddress(CodeGen &codegen, OpBuilder &builder, Location loc,
                        Value size, Value p, Value i) {
  Value mul = builder.create<arith::MulIOp>(loc, size, p);
  if (auto vtp = i.getType().dyn_cast<VectorType>()) {
    Value inv =
        builder.create<arith::IndexCastOp>(loc, vtp.getElementType(), mul);
    mul = genVectorInvariantValue(codegen, builder, inv);
  }
  return builder.create<arith::AddIOp>(loc, mul, i);
}

/// Generates an index value.
static Value genIndexValue(CodeGen &codegen, OpBuilder &builder, unsigned idx,
                           unsigned ldx) {
  Value ival = codegen.loops[idx];
  Type itype = ival.getType();
  // During vectorization, we either encounter:
  // (1) indices already in vector form, as in ... = ind[lo:hi], good to go, or
  // (2) single index, as in ... = i, must convert to [i, i+1, ...] for inner i.
  unsigned vl = codegen.curVecLength;
  if (vl > 1 && !itype.isa<VectorType>()) {
    Location loc = ival.getLoc();
    VectorType vtp = vectorType(codegen, itype);
    ival = builder.create<vector::BroadcastOp>(loc, vtp, ival);
    if (idx == ldx) {
      Value incr;
      if (vtp.isScalable()) {
        Type stepvty = vectorType(codegen, builder.getI64Type());
        Value stepv = builder.create<LLVM::StepVectorOp>(loc, stepvty);
        incr = builder.create<arith::IndexCastOp>(loc, vtp, stepv);
      } else {
        SmallVector<APInt, 4> integers;
        for (unsigned i = 0; i < vl; i++)
          integers.push_back(APInt(/*width=*/64, i));
        auto values = DenseElementsAttr::get(vtp, integers);
        incr = builder.create<arith::ConstantOp>(loc, vtp, values);
      }
      ival = builder.create<arith::AddIOp>(loc, ival, incr);
    }
  }
  return ival;
}

/// Semi-ring branches are simply inlined by the sparse compiler. Prior
/// analysis has verified that all computations are "local" to the inlined
/// branch or otherwise invariantly defined outside the loop nest, with the
/// exception of index computations, which need to be relinked to actual
/// inlined cloned code.
static Value relinkBranch(CodeGen &codegen, RewriterBase &rewriter,
                          Block *block, Value e, unsigned ldx) {
  if (Operation *def = e.getDefiningOp()) {
    if (auto indexOp = dyn_cast<linalg::IndexOp>(def))
      return genIndexValue(codegen, rewriter, indexOp.dim(), ldx);
    if (def->getBlock() == block) {
      for (unsigned i = 0, n = def->getNumOperands(); i < n; i++)
        def->setOperand(
            i, relinkBranch(codegen, rewriter, block, def->getOperand(i), ldx));
    }
  }
  return e;
}

/// Recursively generates tensor expression.
static Value genExp(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                    linalg::GenericOp op, unsigned exp, unsigned ldx) {
  Location loc = op.getLoc();
  if (exp == -1u)
    return Value();
  if (merger.exp(exp).kind == Kind::kTensor)
    return genTensorLoad(merger, codegen, rewriter, op, exp);
  if (merger.exp(exp).kind == Kind::kInvariant)
    return genInvariantValue(merger, codegen, rewriter, exp);
  if (merger.exp(exp).kind == Kind::kIndex)
    return genIndexValue(codegen, rewriter, merger.exp(exp).index, ldx);
  Value v0 =
      genExp(merger, codegen, rewriter, op, merger.exp(exp).children.e0, ldx);
  Value v1 =
      genExp(merger, codegen, rewriter, op, merger.exp(exp).children.e1, ldx);
  Value ee = merger.buildExp(rewriter, loc, exp, v0, v1);
  if (ee && (merger.exp(exp).kind == Kind::kUnary ||
             merger.exp(exp).kind == Kind::kBinary ||
             merger.exp(exp).kind == Kind::kBinaryBranch))
    ee = relinkBranch(codegen, rewriter, ee.getParentBlock(), ee, ldx);
  return ee;
}

/// Determines if affine expression is invariant.
static bool isInvariantAffine(const CodeGen &codegen, AffineExpr a,
                              unsigned ldx, bool &atLevel) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    unsigned idx = a.cast<AffineDimExpr>().getPosition();
    if (idx == ldx)
      atLevel = true;
    return codegen.loops[idx] != nullptr; // no longer in play?
  }
  case AffineExprKind::Add:
  case AffineExprKind::Mul: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return isInvariantAffine(codegen, binOp.getLHS(), ldx, atLevel) &&
           isInvariantAffine(codegen, binOp.getRHS(), ldx, atLevel);
  }
  default:
    return true;
  }
}

/// Hoists loop invariant tensor loads for which indices have been exhausted.
static void genInvariants(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                          linalg::GenericOp op, unsigned exp, unsigned ldx,
                          bool atStart, Kind last = Kind::kTensor) {
  if (exp == -1u)
    return;
  if (merger.exp(exp).kind == Kind::kTensor) {
    // Inspect tensor indices.
    bool atLevel = ldx == -1u;
    OpOperand *t = op.getInputAndOutputOperands()[merger.exp(exp).tensor];
    auto map = op.getTiedIndexingMap(t);
    auto enc = getSparseTensorEncoding(t->get().getType());
    for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
      AffineExpr a = map.getResult(perm(enc, d));
      if (!isInvariantAffine(codegen, a, ldx, atLevel))
        return; // still in play
    }
    // All exhausted at this level (atLevel denotes exactly at this level).
    if (!atLevel)
      return;
    OpOperand *lhs = op.getOutputOperand(0);
    if (lhs == t) {
      // Start or end a scalarized reduction
      if (atStart) {
        Value load = genTensorLoad(merger, codegen, builder, op, exp);
        codegen.redKind = getReduction(last);
        codegen.redExp = exp;
        updateReduc(merger, codegen, load);
      } else {
        Value redVal = codegen.redVal;
        updateReduc(merger, codegen, Value());
        codegen.redExp = -1u;
        codegen.redKind = kNoReduc;
        genTensorStore(merger, codegen, builder, op, exp, redVal);
      }
    } else {
      // Start or end loop invariant hoisting of a tensor load.
      merger.exp(exp).val =
          atStart ? genTensorLoad(merger, codegen, builder, op, exp) : Value();
    }
  } else if (merger.exp(exp).kind != Kind::kInvariant &&
             merger.exp(exp).kind != Kind::kIndex) {
    // Traverse into the binary operations. Note that we only hoist
    // tensor loads, since subsequent MLIR/LLVM passes know how to
    // deal with all other kinds of derived loop invariants.
    Kind last = merger.exp(exp).kind;
    unsigned e0 = merger.exp(exp).children.e0;
    unsigned e1 = merger.exp(exp).children.e1;
    genInvariants(merger, codegen, builder, op, e0, ldx, atStart, last);
    genInvariants(merger, codegen, builder, op, e1, ldx, atStart, last);
  }
}

/// Generates an expanded access pattern in innermost dimension.
static void genExpansion(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                         linalg::GenericOp op, unsigned at, bool atStart) {
  OpOperand *lhs = codegen.sparseOut;
  if (!lhs || codegen.outerParNest != op.getRank(lhs) - 1 ||
      at != codegen.outerParNest)
    return; // not needed at this level
  // Generate start or end of an expanded access pattern.
  Value tensor = lhs->get();
  Location loc = op.getLoc();
  if (atStart) {
    auto dynShape = {ShapedType::kDynamicSize};
    Type etp = tensor.getType().cast<ShapedType>().getElementType();
    Type t1 = MemRefType::get(dynShape, etp);
    Type t2 = MemRefType::get(dynShape, builder.getI1Type());
    Type t3 = MemRefType::get(dynShape, builder.getIndexType());
    Type t4 = builder.getIndexType();
    auto res =
        builder.create<ExpandOp>(loc, TypeRange({t1, t2, t3, t4}), tensor);
    assert(res.getNumResults() == 4);
    assert(!codegen.expValues);
    codegen.expValues = res.getResult(0);
    codegen.expFilled = res.getResult(1);
    codegen.expAdded = res.getResult(2);
    codegen.expCount = res.getResult(3);
  } else {
    assert(codegen.expValues);
    builder.create<CompressOp>(loc, tensor, codegen.lexIdx, codegen.expValues,
                               codegen.expFilled, codegen.expAdded,
                               codegen.expCount);
    codegen.expValues = codegen.expFilled = codegen.expAdded =
        codegen.expCount = Value();
  }
}

/// Generates initialization code for the subsequent loop sequence at
/// current index level. Returns true if the loop sequence needs to
/// maintain the universal index.
static bool genInit(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                    linalg::GenericOp op, std::vector<unsigned> &topSort,
                    unsigned at, BitVector &inits) {
  bool needsUniv = false;
  Location loc = op.getLoc();
  unsigned idx = topSort[at];

  // Initialize sparse positions.
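  // For a compressed dimension at this level, the half-open range
  // [pointers[p], pointers[p+1]) of stored entries is loaded into
  // pidxs/highs below; dense dimensions keep the universal index in play.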
  for (unsigned b = 0, be = inits.size(); b < be; b++) {
    if (inits[b]) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      if (merger.isDim(b, Dim::kSparse)) {
        // Initialize sparse index.
        unsigned pat = at;
        for (; pat != 0; pat--) {
          if (codegen.pidxs[tensor][topSort[pat - 1]])
            break;
        }
        Value ptr = codegen.pointers[tensor][idx];
        Value one = constantIndex(builder, loc, 1);
        Value p0 = (pat == 0) ? constantIndex(builder, loc, 0)
                              : codegen.pidxs[tensor][topSort[pat - 1]];
        codegen.pidxs[tensor][idx] = genLoad(codegen, builder, loc, ptr, p0);
        Value p1 = builder.create<arith::AddIOp>(loc, p0, one);
        codegen.highs[tensor][idx] = genLoad(codegen, builder, loc, ptr, p1);
      } else {
        // Dense index still in play.
        needsUniv = true;
      }
    }
  }

  // Initialize the universal dense index.
  codegen.loops[idx] = constantIndex(builder, loc, 0);
  return needsUniv;
}

/// Returns vectorization strategy. Any implicit inner loop in the Linalg
/// operation is a candidate. Whether it is actually converted to SIMD code
/// depends on the requested strategy.
static bool isVectorFor(CodeGen &codegen, bool isInner, bool isReduction,
                        bool isSparse) {
  // Reject vectorization of sparse output, unless innermost is reduction.
  if (codegen.sparseOut && !isReduction)
    return false;
  // Inspect strategy.
  switch (codegen.options.vectorizationStrategy) {
  case SparseVectorizationStrategy::kNone:
    return false;
  case SparseVectorizationStrategy::kDenseInnerLoop:
    return isInner && !isSparse;
  case SparseVectorizationStrategy::kAnyStorageInnerLoop:
    return isInner;
  }
  llvm_unreachable("unexpected vectorization strategy");
}

/// Returns parallelization strategy. Any implicit loop in the Linalg operation
/// that is marked "parallel" is a candidate. Whether it is actually converted
/// to a parallel operation depends on the requested strategy.
static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,
                          bool isSparse, bool isVector) {
  // Reject parallelization of sparse output.
  if (codegen.sparseOut)
    return false;
  // Inspect strategy.
  switch (codegen.options.parallelizationStrategy) {
  case SparseParallelizationStrategy::kNone:
    return false;
  case SparseParallelizationStrategy::kDenseOuterLoop:
    return isOuter && !isSparse && !isReduction && !isVector;
  case SparseParallelizationStrategy::kAnyStorageOuterLoop:
    return isOuter && !isReduction && !isVector;
  case SparseParallelizationStrategy::kDenseAnyLoop:
    return !isSparse && !isReduction && !isVector;
  case SparseParallelizationStrategy::kAnyStorageAnyLoop:
    return !isReduction && !isVector;
  }
  llvm_unreachable("unexpected parallelization strategy");
}

/// Checks unit stride for dense tensors. The iteration graph may have ignored
/// dense access patterns in order to avoid cycles (sparse access patterns are
/// always placed innermost), but that means dense access has become strided.
/// This prevents effective vectorization.
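/// For example, if the loop order makes i innermost while a dense tensor is
/// accessed as a[i][j], consecutive iterations of i stride over entire rows
/// rather than adjacent elements, so vectorization is rejected.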
static bool denseUnitStrides(Merger &merger, linalg::GenericOp op,
                             unsigned idx) {
  for (OpOperand *t : op.getInputAndOutputOperands()) {
    if (!getSparseTensorEncoding(t->get().getType())) {
      auto map = op.getTiedIndexingMap(t);
      for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
        AffineExpr a = map.getResult(d);
        // Report non-unit stride if innermost index appears at an outer
        // dimension (true non-unit stride) or if the innermost index appears
        // in a compound subscript in the innermost dimension. Even if the
        // latter is unit stride, it does not play well with scatter/gather.
        // TODO: accept unit stride affine innermost like a[i,j+k+1]?
        if (a.isFunctionOfDim(idx) &&
            ((d != rank - 1) || (a.getKind() != AffineExprKind::DimId)))
          return false;
      }
    }
  }
  return true;
}

/// Generates a for-loop on a single index.
static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                         linalg::GenericOp op, bool isOuter, bool isInner,
                         unsigned idx, BitVector &indices) {
  unsigned fb = indices.find_first();
  unsigned tensor = merger.tensor(fb);
  assert(idx == merger.index(fb));
  auto iteratorTypes = op.iterator_types().getValue();
  bool isReduction = isReductionIterator(iteratorTypes[idx]);
  bool isSparse = merger.isDim(fb, Dim::kSparse);
  bool isVector = isVectorFor(codegen, isInner, isReduction, isSparse) &&
                  denseUnitStrides(merger, op, idx);
  bool isParallel =
      isParallelFor(codegen, isOuter, isReduction, isSparse, isVector);

  // Prepare vector length.
  if (isVector)
    codegen.curVecLength = codegen.options.vectorLength;

  // Loop bounds and increment.
  Location loc = op.getLoc();
  Value lo = isSparse ? codegen.pidxs[tensor][idx] : codegen.loops[idx];
  Value hi = isSparse ? codegen.highs[tensor][idx] : codegen.sizes[idx];
  Value step = constantIndex(builder, loc, codegen.curVecLength);
  if (isVector && codegen.options.enableVLAVectorization) {
    Value vscale = builder.create<vector::VectorScaleOp>(
        loc, IndexType::get(builder.getContext()));
    step = builder.create<arith::MulIOp>(loc, vscale, step);
  }

  // Emit a parallel loop.
  if (isParallel) {
    assert(!isVector);
    scf::ParallelOp parOp = builder.create<scf::ParallelOp>(loc, lo, hi, step);
    if (isSparse)
      codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0];
    else
      codegen.loops[idx] = parOp.getInductionVars()[0];
    builder.setInsertionPointToStart(parOp.getBody());
    return parOp;
  }

  // Emit a sequential or vector loop.
  SmallVector<Value, 4> operands;
  if (codegen.redVal) {
    // In a vector loop, bring reduction into SIMD form, if not already.
    if (isVector && !codegen.redVal.getType().isa<VectorType>()) {
      VectorType vtp = vectorType(codegen, codegen.redVal.getType());
      Value vred = genVectorReducInit(codegen, builder, loc, vtp);
      updateReduc(merger, codegen, vred);
    }
    operands.push_back(codegen.redVal);
  }
  if (codegen.expValues)
    operands.push_back(codegen.expCount);
  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, operands);
  if (codegen.redVal)
    updateReduc(merger, codegen, forOp.getRegionIterArgs().front());
  if (codegen.expValues)
    codegen.expCount = forOp.getRegionIterArgs().back();
  // Assign induction variable to sparse or dense index.
  Value iv = forOp.getInductionVar();
  if (isSparse)
    codegen.pidxs[tensor][idx] = iv;
  else
    codegen.loops[idx] = iv;
  builder.setInsertionPointToStart(forOp.getBody());
  // Share vector iteration mask between all subsequent loads/stores.
  if (isVector)
    codegen.curVecMask = genVectorMask(codegen, builder, iv, lo, hi, step);
  return forOp;
}

/// Emit a while-loop for co-iteration over multiple indices.
static Operation *genWhile(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                           linalg::GenericOp op, unsigned idx, bool needsUniv,
                           BitVector &indices) {
  SmallVector<Type, 4> types;
  SmallVector<Value, 4> operands;
  // Construct the while-loop with a parameter for each index.
  Type indexType = builder.getIndexType();
  for (unsigned b = 0, be = indices.size(); b < be; b++) {
    if (indices[b] && merger.isDim(b, Dim::kSparse)) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      types.push_back(indexType);
      operands.push_back(codegen.pidxs[tensor][idx]);
    }
  }
  if (codegen.redVal) {
    types.push_back(codegen.redVal.getType());
    operands.push_back(codegen.redVal);
  }
  if (codegen.expValues) {
    types.push_back(indexType);
    operands.push_back(codegen.expCount);
  }
  if (needsUniv) {
    types.push_back(indexType);
    operands.push_back(codegen.loops[idx]);
  }
  assert(types.size() == operands.size());
  Location loc = op.getLoc();
  scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);

  SmallVector<Location> locs(types.size(), loc);
  Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
  Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);

  // Build the "before" region, which effectively consists
  // of a conjunction of "i < upper" tests on all induction variables.
  builder.setInsertionPointToStart(&whileOp.getBefore().front());
  Value cond;
  unsigned o = 0;
  for (unsigned b = 0, be = indices.size(); b < be; b++) {
    if (indices[b] && merger.isDim(b, Dim::kSparse)) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      Value op1 = before->getArgument(o);
      Value op2 = codegen.highs[tensor][idx];
      Value opc = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
                                                op1, op2);
      cond = cond ? builder.create<arith::AndIOp>(loc, cond, opc) : opc;
      codegen.pidxs[tensor][idx] = after->getArgument(o++);
    }
  }
  if (codegen.redVal)
    updateReduc(merger, codegen, after->getArgument(o++));
  if (codegen.expValues)
    codegen.expCount = after->getArgument(o++);
  if (needsUniv)
    codegen.loops[idx] = after->getArgument(o++);
  assert(o == operands.size());
  builder.create<scf::ConditionOp>(loc, cond, before->getArguments());
  builder.setInsertionPointToStart(&whileOp.getAfter().front());
  return whileOp;
}

/// Generates a for-loop or a while-loop, depending on whether it implements
/// singleton iteration or co-iteration over the given conjunction.
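/// A single index in the conjunction yields a simple for-loop, whereas two or
/// more indices require a while-loop that co-iterates, conditionally advancing
/// each sparse iterator (and possibly the universal index).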
static Operation *genLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                          linalg::GenericOp op, std::vector<unsigned> &topSort,
                          unsigned at, bool needsUniv, BitVector &indices) {
  unsigned idx = topSort[at];
  if (indices.count() == 1) {
    bool isOuter = at == 0;
    bool isInner = at == topSort.size() - 1;
    return genFor(merger, codegen, builder, op, isOuter, isInner, idx, indices);
  }
  return genWhile(merger, codegen, builder, op, idx, needsUniv, indices);
}

/// Generates the local variables for this loop, consisting of the sparse
/// indices, restored universal dense index, and dense positions.
static void genLocals(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                      linalg::GenericOp op, std::vector<unsigned> &topSort,
                      unsigned at, bool needsUniv, BitVector &locals) {
  Location loc = op.getLoc();
  unsigned idx = topSort[at];

  // Initialize sparse indices.
  Value min;
  for (unsigned b = 0, be = locals.size(); b < be; b++) {
    if (locals[b] && merger.isDim(b, Dim::kSparse)) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      Value ptr = codegen.indices[tensor][idx];
      Value s = codegen.pidxs[tensor][idx];
      Value load = genLoad(codegen, builder, loc, ptr, s);
      codegen.idxs[tensor][idx] = load;
      if (!needsUniv) {
        if (min) {
          Value cmp = builder.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::ult, load, min);
          min = builder.create<arith::SelectOp>(loc, cmp, load, min);
        } else {
          min = load;
        }
      }
    }
  }

  // Merge dense universal index over minimum.
  if (min) {
    assert(!needsUniv);
    codegen.loops[idx] = min;
  }

  // Initialize dense positions. Note that we generate dense indices of the
  // output tensor unconditionally, since they may not appear in the lattice,
  // but may be needed for linearized codegen.
  for (unsigned b = 0, be = locals.size(); b < be; b++) {
    if ((locals[b] || merger.isOutTensor(b, idx)) &&
        merger.isDim(b, Dim::kDense)) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      unsigned pat = at;
      for (; pat != 0; pat--)
        if (codegen.pidxs[tensor][topSort[pat - 1]])
          break;
      Value p = (pat == 0) ? constantIndex(builder, loc, 0)
                           : codegen.pidxs[tensor][topSort[pat - 1]];
      codegen.pidxs[tensor][idx] = genAddress(
          codegen, builder, loc, codegen.sizes[idx], p, codegen.loops[idx]);
    }
  }

  // Move the insertion indices in lexicographic index order. During access
  // pattern expansion, we can skip setting the innermost dimension.
  if (codegen.sparseOut && !codegen.expValues) {
    Value pos = constantIndex(builder, loc, at);
    builder.create<memref::StoreOp>(loc, codegen.loops[idx], codegen.lexIdx,
                                    pos);
  }
}

/// Generates the induction structure for a while-loop.
static void genWhileInduction(Merger &merger, CodeGen &codegen,
                              OpBuilder &builder, linalg::GenericOp op,
                              unsigned idx, bool needsUniv,
                              BitVector &induction, scf::WhileOp whileOp) {
  Location loc = op.getLoc();
  // Finalize each else branch of all if statements.
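  // Pending reduction values and expansion counts must first be yielded out
  // of every enclosing if-statement, so that the while-loop induction emitted
  // below can consume them.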
  if (codegen.redVal || codegen.expValues) {
    while (auto ifOp = dyn_cast_or_null<scf::IfOp>(
               builder.getInsertionBlock()->getParentOp())) {
      unsigned y = 0;
      SmallVector<Value, 4> yields;
      if (codegen.redVal) {
        yields.push_back(codegen.redVal);
        updateReduc(merger, codegen, ifOp.getResult(y++));
      }
      if (codegen.expValues) {
        yields.push_back(codegen.expCount);
        codegen.expCount = ifOp->getResult(y++);
      }
      assert(y == yields.size());
      builder.create<scf::YieldOp>(loc, yields);
      builder.setInsertionPointAfter(ifOp);
    }
  }
  builder.setInsertionPointToEnd(&whileOp.getAfter().front());
  // Finalize the induction. Note that the induction could be performed
  // in the individual if-branches to avoid re-evaluating the conditions.
  // However, that would result in a rather elaborate forest of yield
  // instructions during code generation. Moreover, performing the induction
  // after the if-statements more closely resembles code generated by TACO.
  unsigned o = 0;
  SmallVector<Value, 4> operands;
  Value one = constantIndex(builder, loc, 1);
  for (unsigned b = 0, be = induction.size(); b < be; b++) {
    if (induction[b] && merger.isDim(b, Dim::kSparse)) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      Value op1 = codegen.idxs[tensor][idx];
      Value op2 = codegen.loops[idx];
      Value op3 = codegen.pidxs[tensor][idx];
      Value cmp = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                                op1, op2);
      Value add = builder.create<arith::AddIOp>(loc, op3, one);
      operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, op3));
      codegen.pidxs[tensor][idx] = whileOp->getResult(o++);
    }
  }
  if (codegen.redVal) {
    operands.push_back(codegen.redVal);
    updateReduc(merger, codegen, whileOp->getResult(o++));
  }
  if (codegen.expValues) {
    operands.push_back(codegen.expCount);
    codegen.expCount = whileOp->getResult(o++);
  }
  if (needsUniv) {
    operands.push_back(
        builder.create<arith::AddIOp>(loc, codegen.loops[idx], one));
    codegen.loops[idx] = whileOp->getResult(o++);
  }
  assert(o == operands.size());
  builder.create<scf::YieldOp>(loc, operands);
  builder.setInsertionPointAfter(whileOp);
}

/// Generates the induction structure for a for-loop.
static void genForInduction(Merger &merger, CodeGen &codegen,
                            OpBuilder &builder, linalg::GenericOp op,
                            Operation *loop) {
  Location loc = op.getLoc();
  unsigned o = 0;
  SmallVector<Value, 4> operands;
  if (codegen.redVal) {
    operands.push_back(codegen.redVal);
    updateReduc(merger, codegen, loop->getResult(o++));
  }
  if (codegen.expValues) {
    operands.push_back(codegen.expCount);
    codegen.expCount = loop->getResult(o++);
  }
  assert(o == operands.size());
  if (o > 0)
    builder.create<scf::YieldOp>(loc, operands);
  builder.setInsertionPointAfter(loop);
}

/// Generates a single if-statement within a while-loop.
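/// The if-condition is the conjunction of an index-equality test for every
/// sparse tensor in the given conditions; dense dimensions contribute a
/// trivially true clause.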
static scf::IfOp genIf(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                       linalg::GenericOp op, unsigned idx,
                       BitVector &conditions) {
  Location loc = op.getLoc();
  SmallVector<Type, 4> types;
  Value cond;
  for (unsigned b = 0, be = conditions.size(); b < be; b++) {
    if (conditions[b]) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      Value clause;
      if (merger.isDim(b, Dim::kSparse)) {
        Value op1 = codegen.idxs[tensor][idx];
        Value op2 = codegen.loops[idx];
        clause = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                               op1, op2);
      } else {
        clause = constantI1(builder, loc, true);
      }
      cond = cond ? builder.create<arith::AndIOp>(loc, cond, clause) : clause;
    }
  }
  if (codegen.redVal)
    types.push_back(codegen.redVal.getType());
  if (codegen.expValues)
    types.push_back(builder.getIndexType());
  scf::IfOp ifOp = builder.create<scf::IfOp>(loc, types, cond, /*else=*/true);
  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
  return ifOp;
}

/// Generates the end of the true branch of an if-statement within a
/// while-loop.
static void endIf(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                  linalg::GenericOp op, scf::IfOp ifOp, Operation *loop,
                  Value redInput, Value cntInput) {
  SmallVector<Value, 4> operands;
  if (codegen.redVal) {
    operands.push_back(codegen.redVal);
    updateReduc(merger, codegen, redInput);
  }
  if (codegen.expValues) {
    operands.push_back(codegen.expCount);
    codegen.expCount = cntInput;
  }
  if (!operands.empty())
    builder.create<scf::YieldOp>(op.getLoc(), operands);
  builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
}

//===----------------------------------------------------------------------===//
// Sparse compiler synthesis methods (loop sequence).
//===----------------------------------------------------------------------===//

/// Starts a loop sequence at given level. Returns true if
/// the universal loop index must be maintained at this level.
static bool startLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                         linalg::GenericOp op, std::vector<unsigned> &topSort,
                         unsigned exp, unsigned at, unsigned idx, unsigned ldx,
                         unsigned lts) {
  assert(codegen.curVecLength == 1);
  assert(!codegen.loops[idx]);
  // Emit invariants at this loop sequence level.
  genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/true);
  // Emit access pattern expansion for sparse tensor output.
  genExpansion(merger, codegen, builder, op, at, /*atStart=*/true);
  // Emit further initialization at this loop sequence level.
  unsigned l0 = merger.set(lts)[0];
  bool needsUniv =
      genInit(merger, codegen, builder, op, topSort, at, merger.lat(l0).bits);
  // Maintain the universal index only if it is actually
  // consumed by a subsequent lattice point.
  if (needsUniv) {
    unsigned lsize = merger.set(lts).size();
    for (unsigned i = 1; i < lsize; i++) {
      unsigned li = merger.set(lts)[i];
      if (!merger.hasAnyDimOf(merger.lat(li).simple, Dim::kSparse))
        return true;
    }
  }
  return false;
}

/// Starts a single loop in current sequence.
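/// This emits the for/while-loop control followed by the local index and
/// position initialization for that loop.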
static Operation *startLoop(Merger &merger, CodeGen &codegen,
                            OpBuilder &builder, linalg::GenericOp op,
                            std::vector<unsigned> &topSort, unsigned at,
                            unsigned li, bool needsUniv) {
  assert(codegen.curVecLength == 1);
  // Emit the for/while-loop control.
  Operation *loop = genLoop(merger, codegen, builder, op, topSort, at,
                            needsUniv, merger.lat(li).simple);
  // Emit the locals for this loop.
  genLocals(merger, codegen, builder, op, topSort, at, needsUniv,
            merger.lat(li).bits);
  return loop;
}

/// Ends a single loop in current sequence. Returns the new value for
/// needsUniv.
static bool endLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                    linalg::GenericOp op, Operation *loop, unsigned idx,
                    unsigned li, bool needsUniv) {
  codegen.curVecLength = 1;
  // End a while-loop.
  if (auto whileOp = dyn_cast<scf::WhileOp>(loop)) {
    genWhileInduction(merger, codegen, builder, op, idx, needsUniv,
                      merger.lat(li).bits, whileOp);
    return needsUniv;
  }
  // End a for-loop.
  genForInduction(merger, codegen, builder, op, loop);
  return false;
}

/// Ends a loop sequence at given level.
static void endLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                       linalg::GenericOp op, unsigned exp, unsigned at,
                       unsigned idx, unsigned ldx) {
  assert(codegen.curVecLength == 1);
  codegen.loops[idx] = Value();
  // Bring a pending reduction back from SIMD form when sequence ends.
  if (codegen.redVal)
    if (auto vtp = codegen.redVal.getType().dyn_cast<VectorType>())
      updateReduc(merger, codegen,
                  genVectorReducEnd(codegen, builder, op.getLoc(), vtp));
  // Unmark bookkeeping of invariants and loop index.
  genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/false);
  // Finalize access pattern expansion for sparse tensor output.
  genExpansion(merger, codegen, builder, op, at, /*atStart=*/false);
}

/// Recursively generates code while computing iteration lattices in order
/// to manage the complexity of implementing co-iteration over unions
/// and intersections of sparse iteration spaces.
static void genStmt(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                    linalg::GenericOp op, std::vector<unsigned> &topSort,
                    unsigned exp, unsigned at) {
  // At each leaf, assign remaining tensor (sub)expression to output tensor.
  if (at == topSort.size()) {
    unsigned ldx = topSort[at - 1];
    Value rhs = genExp(merger, codegen, rewriter, op, exp, ldx);
    genTensorStore(merger, codegen, rewriter, op, exp, rhs);
    return;
  }

  // Construct iteration lattices for current loop index, with L0 at top.
  unsigned idx = topSort[at];
  unsigned ldx = at == 0 ? -1u : topSort[at - 1];
  unsigned lts = merger.optimizeSet(merger.buildLattices(exp, idx));

  // Start a loop sequence.
  bool needsUniv = startLoopSeq(merger, codegen, rewriter, op, topSort, exp, at,
                                idx, ldx, lts);

  // Emit a loop for every lattice point L0 >= Li in this loop sequence.
  unsigned lsize = merger.set(lts).size();
  for (unsigned i = 0; i < lsize; i++) {
    // Start a loop.
    unsigned li = merger.set(lts)[i];
    Operation *loop =
        startLoop(merger, codegen, rewriter, op, topSort, at, li, needsUniv);

    // Visit all lattice points with Li >= Lj to generate the
    // loop-body, possibly with if statements for co-iteration.
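    // Save the reduction value and expansion count that flow into the loop
    // body, so that after each if-branch ends, the bookkeeping can be reset
    // to these incoming values.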
    Value redInput = codegen.redVal;
    Value cntInput = codegen.expCount;
    bool isWhile = dyn_cast<scf::WhileOp>(loop) != nullptr;
    for (unsigned j = 0; j < lsize; j++) {
      unsigned lj = merger.set(lts)[j];
      unsigned ej = merger.lat(lj).exp;
      if (li == lj || merger.latGT(li, lj)) {
        // Recurse into body of each branch.
        if (isWhile) {
          scf::IfOp ifOp =
              genIf(merger, codegen, rewriter, op, idx, merger.lat(lj).simple);
          genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1);
          endIf(merger, codegen, rewriter, op, ifOp, loop, redInput, cntInput);
        } else {
          genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1);
        }
      }
    }

    // End a loop.
    needsUniv =
        endLoop(merger, codegen, rewriter, op, loop, idx, li, needsUniv);
  }

  // End a loop sequence.
  endLoopSeq(merger, codegen, rewriter, op, exp, at, idx, ldx);
}

/// Converts the result computed by the sparse kernel into the required form.
static void genResult(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                      linalg::GenericOp op) {
  OpOperand *lhs = op.getOutputOperand(0);
  Type resType = lhs->get().getType();
  if (getSparseTensorEncoding(resType)) {
    // The sparse tensor rematerializes from the original sparse tensor's
    // underlying sparse storage format.
    rewriter.replaceOpWithNewOp<LoadOp>(op, resType, lhs->get(),
                                        codegen.sparseOut == lhs);
  } else {
    // To rematerialize a non-annotated tensor, simply load it
    // from the bufferized value.
    Value val = codegen.buffers.back(); // value array
    rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, resType, val);
  }
}

//===----------------------------------------------------------------------===//
// Sparse compiler rewriting methods.
//===----------------------------------------------------------------------===//

namespace {

/// Sparse rewriting rule for generic Linalg operation.
struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
public:
  GenericOpSparsifier(MLIRContext *context, SparsificationOptions o)
      : OpRewritePattern<linalg::GenericOp>(context), options(o) {}

  LogicalResult matchAndRewrite(linalg::GenericOp op,
                                PatternRewriter &rewriter) const override {
    // Detects sparse annotations and translates the per-dimension sparsity
    // information for all tensors to loop indices in the kernel.
    assert(op.getNumOutputs() == 1);
    unsigned numTensors = op.getNumInputsAndOutputs();
    unsigned numLoops = op.iterator_types().getValue().size();
    Merger merger(numTensors, numLoops);
    if (!findSparseAnnotations(merger, op))
      return failure();

    // Computes a topologically sorted iteration graph to ensure tensors
    // are visited in natural index order. Gradually relaxes the considered
    // constraints until an acyclic iteration graph results, such that sparse
    // code generation can proceed. As a last resort, an attempt is made
    // to resolve cycles by inserting a conversion.
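    // Note that the sort masks below are tried from most to least
    // constrained (kIncludeAll down to kSparseOnly).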
    std::vector<unsigned> topSort;
    if (!computeIterationGraph(merger, op, topSort, SortMask::kIncludeAll) &&
        !computeIterationGraph(merger, op, topSort, SortMask::kIncludeUndef) &&
        !computeIterationGraph(merger, op, topSort, SortMask::kIncludeDense) &&
        !computeIterationGraph(merger, op, topSort, SortMask::kSparseOnly)) {
      return resolveCycle(merger, rewriter, op);
    }

    // Builds the tensor expression for the Linalg operation in SSA form.
    Optional<unsigned> optExp = merger.buildTensorExpFromLinalg(op);
    if (!optExp.hasValue())
      return failure();
    unsigned exp = optExp.getValue();

    // Rejects an inadmissible tensor expression.
    OpOperand *sparseOut = nullptr;
    unsigned outerParNest = 0;
    if (!isAdmissableTensorExp(merger, op, topSort, exp, &sparseOut,
                               outerParNest))
      return failure();

    // Recursively generates code.
    merger.setHasSparseOut(sparseOut != nullptr);
    CodeGen codegen(options, numTensors, numLoops, sparseOut, outerParNest);
    genBuffers(merger, codegen, rewriter, op);
    genStmt(merger, codegen, rewriter, op, topSort, exp, 0);
    genResult(merger, codegen, rewriter, op);
    return success();
  }

private:
  // Last resort cycle resolution.
  LogicalResult resolveCycle(Merger &merger, PatternRewriter &rewriter,
                             linalg::GenericOp op) const {
    // Compute topological sort while leaving out every
    // sparse input tensor in succession until an acyclic
    // iteration graph results.
    std::vector<unsigned> topSort;
    for (OpOperand *t : op.getInputOperands()) {
      unsigned tensor = t->getOperandNumber();
      Value tval = t->get();
      auto srcEnc = getSparseTensorEncoding(tval.getType());
      if (!srcEnc ||
          !computeIterationGraph(merger, op, topSort, SortMask::kSparseOnly, t))
        continue;
      // Found an input tensor that resolves the cycle by inserting a
      // conversion into a sparse tensor that adheres to the iteration
      // graph order. Also releases the temporary sparse tensor.
      //
      // TODO: investigate fusing the conversion with computation,
      // especially if it is a direct yield!
      //
      auto srcTp = tval.getType().cast<RankedTensorType>();
      auto dstEnc = SparseTensorEncodingAttr::get(
          op->getContext(), srcEnc.getDimLevelType(),
          permute(getContext(), op.getTiedIndexingMap(t), topSort), // new order
          srcEnc.getPointerBitWidth(), srcEnc.getIndexBitWidth());
      auto dstTp = RankedTensorType::get(srcTp.getShape(),
                                         srcTp.getElementType(), dstEnc);
      auto convert = rewriter.create<ConvertOp>(tval.getLoc(), dstTp, tval);
      op->setOperand(tensor, convert);
      rewriter.setInsertionPointAfter(op);
      rewriter.create<ReleaseOp>(tval.getLoc(), convert);
      return success();
    }
    // Cannot be resolved with a single conversion.
    // TODO: convert more than one?
    return failure();
  }

  /// Options to control sparse code generation.
  SparsificationOptions options;
};

} // namespace

/// Populates the given patterns list with rewriting rules required for
/// the sparsification of linear algebra operations.
void mlir::populateSparsificationPatterns(
    RewritePatternSet &patterns, const SparsificationOptions &options) {
  patterns.add<GenericOpSparsifier>(patterns.getContext(), options);
}
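// A sketch of typical usage from a pass driver (everything except
// populateSparsificationPatterns is an assumption, not part of this file):
//
//   RewritePatternSet patterns(&getContext());
//   populateSparsificationPatterns(patterns, options);
//   (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));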