//===- Sparsification.cpp - Implementation of sparsification --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements converting sparse tensor types to actual sparse code.
//
//===----------------------------------------------------------------------===//

#include "CodegenUtils.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
#include "mlir/Dialect/SparseTensor/Utils/Merger.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/TensorEncoding.h"
#include "llvm/ADT/SmallBitVector.h"

using namespace mlir;
using namespace mlir::sparse_tensor;

//===----------------------------------------------------------------------===//
// Declarations of data structures.
//===----------------------------------------------------------------------===//

namespace {

// Iteration graph sorting.
enum SortMask { kSparseOnly = 0x0, kIncludeDense = 0x1, kIncludeUndef = 0x2 };

// Reduction kinds.
enum Reduction { kNoReduc, kSum, kProduct, kAnd, kOr, kXor };

// Code generation.
struct CodeGen {
  CodeGen(SparsificationOptions o, unsigned numTensors, unsigned numLoops,
          OpOperand *op, unsigned nest)
      : options(o), loops(numLoops), sizes(numLoops), buffers(numTensors),
        pointers(numTensors, std::vector<Value>(numLoops)),
        indices(numTensors, std::vector<Value>(numLoops)),
        highs(numTensors, std::vector<Value>(numLoops)),
        pidxs(numTensors, std::vector<Value>(numLoops)),
        idxs(numTensors, std::vector<Value>(numLoops)), redVal(), sparseOut(op),
        outerParNest(nest), lexIdx(), expValues(), expFilled(), expAdded(),
        expCount(), curVecMask() {}
  /// Sparsification options.
  SparsificationOptions options;
  /// Universal dense indices and upper bounds (by index). The loops array
  /// is updated with the value of the universal dense index in the current
  /// loop. The sizes array is set once with the inferred dimension sizes.
  std::vector<Value> loops;
  std::vector<Value> sizes;
  /// Buffers for storing dense and sparse numerical values (by tensor).
  /// This array is set once during bufferization of all tensors.
  std::vector<Value> buffers;
  /// Sparse storage schemes (1-D): pointers and indices (by tensor and index).
  /// This array is set once during bufferization of all sparse tensors.
  std::vector<std::vector<Value>> pointers;
  std::vector<std::vector<Value>> indices;
  /// Sparse iteration information (by tensor and index). These arrays
  /// are updated to remain current within the current loop.
  std::vector<std::vector<Value>> highs;
  std::vector<std::vector<Value>> pidxs;
  std::vector<std::vector<Value>> idxs;
  /// Current reduction, updated during code generation. When indices of a
  /// reduction are exhausted, all inner loops can use a scalarized reduction.
  unsigned redExp = -1u;
  Value redVal;
  Reduction redKind = kNoReduc;
  // Sparse tensor as output. Implemented either through direct injective
  // insertion in lexicographic index order (where indices are updated
  // in the temporary array `lexIdx`) or through access pattern expansion
  // in the innermost loop nest (`expValues` through `expCount`).
  OpOperand *sparseOut;
  unsigned outerParNest;
  Value lexIdx;
  Value expValues;
  Value expFilled;
  Value expAdded;
  Value expCount;
  // Current vector length and mask.
  unsigned curVecLength = 1;
  Value curVecMask;
};

} // namespace

//===----------------------------------------------------------------------===//
// Sparse compiler analysis methods.
//===----------------------------------------------------------------------===//

/// Helper method to apply dimension ordering permutation.
static unsigned perm(const SparseTensorEncodingAttr &enc, unsigned d) {
  if (enc) {
    auto order = enc.getDimOrdering();
    if (order) {
      assert(order.isPermutation());
      return order.getDimPosition(d);
    }
  }
  return d;
}

/// Helper method to translate dim level type to internal representation.
static Dim toDim(const SparseTensorEncodingAttr &enc, unsigned d) {
  if (enc) {
    SparseTensorEncodingAttr::DimLevelType tp = enc.getDimLevelType()[d];
    if (tp == SparseTensorEncodingAttr::DimLevelType::Compressed)
      return Dim::kSparse;
    if (tp == SparseTensorEncodingAttr::DimLevelType::Singleton)
      return Dim::kSingle;
  }
  return Dim::kDense;
}

/// Helper method to inspect affine expressions. Rejects cases where the
/// same index is used more than once. Also rejects affine expressions
/// that are not a direct index for annotated tensors.
// TODO: accept more affine cases for sparse tensors
static bool findAffine(Merger &merger, unsigned tensor, AffineExpr a, Dim dim,
                       bool isDense) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    unsigned idx = a.cast<AffineDimExpr>().getPosition();
    if (!merger.isDim(tensor, idx, Dim::kUndef))
      return false; // used more than once
    merger.setDim(tensor, idx, dim);
    return true;
  }
  case AffineExprKind::Add:
  case AffineExprKind::Mul: {
    if (!isDense)
      return false;
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return findAffine(merger, tensor, binOp.getLHS(), dim, isDense) &&
           findAffine(merger, tensor, binOp.getRHS(), dim, isDense);
  }
  case AffineExprKind::Constant:
    return isDense;
  default:
    return false;
  }
}
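
// As a hypothetical illustration of the rule above (the operand names are
// illustrative only): in a kernel that subscripts a dense operand D with
// (i, j + 1) and a sparse-annotated operand S with (i, j), the compound
// expression j + 1 is accepted for D (isDense is true), whereas S must use
// the loop indices i and j directly; a compound or repeated subscript on S
// makes findAffine return false so that sparsification bails out.
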
/// Helper method to inspect sparse encodings in the tensor types.
/// Fills the per-dimension sparsity information for all tensors.
/// Returns true if the sparse annotations and affine subscript
/// expressions of all tensors are admissible. Returns false if
/// no annotations are found or inadmissible constructs occur.
static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) {
  bool annotated = false;
  for (OpOperand *t : op.getInputAndOutputOperands()) {
    auto map = op.getTiedIndexingMap(t);
    auto enc = getSparseTensorEncoding(t->get().getType());
    if (enc)
      annotated = true;
    assert(map.getNumResults() == op.getRank(t));
    for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
      unsigned tensor = t->getOperandNumber();
      AffineExpr a = map.getResult(perm(enc, d));
      if (!findAffine(merger, tensor, a, toDim(enc, d), !enc))
        return false; // inadmissible affine expression
    }
  }
  return annotated;
}

/// A DFS helper to compute a topological sort. Note that recursion is
/// bounded by the number of implicit loops, which is always small.
/// Returns false when a cycle is detected.
static bool topSortDFS(unsigned i, std::vector<unsigned> &visit,
                       std::vector<unsigned> &topSort,
                       std::vector<std::vector<bool>> &adjM) {
  if (visit[i] != 0)
    return visit[i] != 1; // 1 denotes cycle!
  visit[i] = 1;
  for (unsigned j = 0, e = visit.size(); j < e; j++)
    if (adjM[i][j])
      if (!topSortDFS(j, visit, topSort, adjM))
        return false;
  visit[i] = 2;
  topSort.push_back(i);
  return true;
}

/// Helper method to add all constraints from the indices in one affine
/// expression before all indices in the other affine expression. For
/// example i0+i1 < i2+i3+1 yields i0<i2, i0<i3, i1<i2, and i1<i3.
static void addAffineOrderings(std::vector<std::vector<bool>> &adjM,
                               AffineExpr a, AffineExpr b, unsigned fidx) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    unsigned idx = a.cast<AffineDimExpr>().getPosition();
    if (b)
      addAffineOrderings(adjM, b, AffineExpr(), idx);
    else
      adjM[fidx][idx] = true;
    break;
  }
  case AffineExprKind::Add:
  case AffineExprKind::Mul: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    addAffineOrderings(adjM, binOp.getLHS(), b, fidx);
    addAffineOrderings(adjM, binOp.getRHS(), b, fidx);
    break;
  }
  default:
    break;
  }
}

/// Computes a topologically sorted iteration graph for the linalg operation.
/// Ensures all tensors are visited in natural index order. This is essential
/// for sparse storage formats since these only support access along fixed
/// dimensions. Even for dense storage formats, however, the natural index
/// order yields innermost unit-stride access with better spatial locality.
static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
                                  std::vector<unsigned> &topSort,
                                  unsigned mask) {
  // Set up an n x n from/to adjacency matrix of the iteration graph
  // for the implicit loop indices i_0 .. i_n-1.
  unsigned n = op.getNumLoops();
  std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));

  // Iterate over the indexing maps of every tensor in the tensor expression.
  for (OpOperand *t : op.getInputAndOutputOperands()) {
    auto map = op.getTiedIndexingMap(t);
    auto enc = getSparseTensorEncoding(t->get().getType());
    assert(map.getNumDims() == n);
    // Skip dense tensor constraints when not requested.
    if (!(mask & SortMask::kIncludeDense) && !enc)
      continue;
    // Each tensor expression and optional dimension ordering (row-major
    // by default) puts an ordering constraint on the loop indices. For
    // example, the tensor expression A_ijk forces the ordering i < j < k
    // on the loop indices if no explicit dimension ordering is given.
    for (unsigned d = 1, rank = map.getNumResults(); d < rank; d++) {
      AffineExpr f = map.getResult(perm(enc, d - 1));
      AffineExpr t = map.getResult(perm(enc, d));
      addAffineOrderings(adjM, f, t, 0);
    }
    // Push unrelated loops into sparse iteration space, so these
    // will be skipped more often.
    if (mask & SortMask::kIncludeUndef) {
      unsigned tensor = t->getOperandNumber();
      for (unsigned i = 0; i < n; i++)
        if (merger.isDim(tensor, i, Dim::kSparse))
          for (unsigned j = 0; j < n; j++)
            if (merger.isDim(tensor, j, Dim::kUndef))
              adjM[i][j] = true;
    }
  }

  // Topologically sort the iteration graph to determine loop order.
  // Report failure for a cyclic iteration graph.
  topSort.clear();
  topSort.reserve(n);
  std::vector<unsigned> visit(n, 0);
  for (unsigned i = 0; i < n; i++)
    if (visit[i] == 0)
      if (!topSortDFS(i, visit, topSort, adjM))
        return false; // cycle!
  std::reverse(std::begin(topSort), std::end(topSort));
  return true;
}

/// Returns true if tensor has an in-place annotation.
static bool isInPlace(Value val) {
  if (auto arg = val.dyn_cast<BlockArgument>())
    if (auto funcOp = dyn_cast<func::FuncOp>(arg.getOwner()->getParentOp()))
      if (auto attr = funcOp.getArgAttrOfType<BoolAttr>(
              arg.getArgNumber(),
              bufferization::BufferizableOpInterface::kInplaceableAttrName))
        return attr.getValue();
  return false;
}

/// Returns true if tensor materializes uninitialized into the computation.
static bool isMaterializing(Value val) {
  return val.getDefiningOp<linalg::InitTensorOp>() ||
         val.getDefiningOp<bufferization::AllocTensorOp>();
}

/// Returns true when the tensor expression is admissible for codegen.
/// Since all sparse input tensors are admissible, we just need to check
/// whether the out tensor in the tensor expression codegen is admissible.
/// Sets `sparseOut` to the tensor and `outerParNest` to the outer injective
/// nesting depth when a "truly dynamic" sparse tensor output occurs.
static bool isAdmissableTensorExp(Merger &merger, linalg::GenericOp op,
                                  std::vector<unsigned> &topSort, unsigned exp,
                                  OpOperand **sparseOut,
                                  unsigned &outerParNest) {
  OpOperand *lhs = op.getOutputOperand(0);
  unsigned tensor = lhs->getOperandNumber();
  auto enc = getSparseTensorEncoding(lhs->get().getType());
  // A non-annotated output tensor is assumed dense, and becomes a random
  // access n-dim memref. Admissible since insertions cannot occur.
  if (!enc)
    return true;
  // An all-dense annotated "sparse" output tensor becomes a linearized random
  // access 1-dim memref. Also admissible since insertions cannot occur.
  bool allDense = true;
  auto iteratorTypes = op.iterator_types().getValue();
  unsigned numLoops = iteratorTypes.size();
  for (unsigned i = 0; i < numLoops; i++)
    if (merger.isDim(tensor, i, Dim::kSparse)) {
      allDense = false;
      break;
    }
  if (allDense)
    return true;
  // A tensor expression with a sparse output tensor that changes its values
  // but not its nonzero structure, an operation called "simply dynamic" in
  // [Bik96,Ch9], is also admissible without special codegen, provided
  // the tensor's underlying sparse storage scheme can be modified in place.
  if (merger.isSingleCondition(tensor, exp) && isInPlace(lhs->get()))
    return true;
  // Accept "truly dynamic" if the output tensor materializes uninitialized
  // into the computation and insertions occur in lexicographic index order.
  if (isMaterializing(lhs->get())) {
    unsigned nest = 0;
    for (unsigned i = 0; i < numLoops; i++) {
      if (isReductionIterator(iteratorTypes[topSort[i]]))
        break; // terminate at first reduction
      nest++;
    }
    // Determine admissible dynamic insertion situations:
    // (1) fully injective, since there are no reductions,
    // (2) admissible 1-d expansion in innermost dimension.
    if (nest >= op.getRank(lhs) - 1) {
      *sparseOut = lhs;
      outerParNest = nest;
      return true;
    }
  }
  return false;
}

//===----------------------------------------------------------------------===//
// Sparse compiler synthesis methods (reductions).
//===----------------------------------------------------------------------===//

/// Maps reduction kind to vector::CombiningKind.
static vector::CombiningKind getCombiningKind(Reduction kind) {
  switch (kind) {
  case kNoReduc:
    break;
  case kSum:
    return vector::CombiningKind::ADD;
  case kProduct:
    return vector::CombiningKind::MUL;
  case kAnd:
    return vector::CombiningKind::AND;
  case kOr:
    return vector::CombiningKind::OR;
  case kXor:
    return vector::CombiningKind::XOR;
  }
  llvm_unreachable("unknown reduction kind");
}

/// Maps operation to reduction.
static Reduction getReduction(Kind kind) {
  switch (kind) {
  case Kind::kAddF:
  case Kind::kAddC:
  case Kind::kAddI:
  case Kind::kSubF:
  case Kind::kSubC:
  case Kind::kSubI:
    return kSum;
  case Kind::kMulF:
  case Kind::kMulC:
  case Kind::kMulI:
    return kProduct;
  case Kind::kAndI:
    return kAnd;
  case Kind::kOrI:
    return kOr;
  case Kind::kXorI:
    return kXor;
  default:
    llvm_unreachable("unexpected reduction operator");
  }
}

/// Generates an initial value for a vector reduction, following the scheme
/// given in Chapter 5 of "The Software Vectorization Handbook", where the
/// initial scalar value is correctly embedded in the vector reduction value,
/// and a straightforward horizontal reduction will complete the operation.
static Value genVectorReducInit(CodeGen &codegen, OpBuilder &builder,
                                Location loc, VectorType vtp) {
  Value r = codegen.redVal;
  switch (codegen.redKind) {
  case kNoReduc:
    break;
  case kSum:
  case kXor:
    // Initialize reduction vector to: | 0 | .. | 0 | r |
    return builder.create<vector::InsertElementOp>(
        loc, r, constantZero(builder, loc, vtp),
        constantIndex(builder, loc, 0));
  case kProduct:
    // Initialize reduction vector to: | 1 | .. | 1 | r |
    return builder.create<vector::InsertElementOp>(
        loc, r, constantOne(builder, loc, vtp), constantIndex(builder, loc, 0));
  case kAnd:
  case kOr:
    // Initialize reduction vector to: | r | .. | r | r |
    return builder.create<vector::BroadcastOp>(loc, vtp, r);
  }
  llvm_unreachable("unknown reduction kind");
}

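// A worked illustration (the concrete numbers are only an example): for a sum
// reduction with scalar value r = 5 and vector length 4, the loop enters SIMD
// form with the vector | 0 | 0 | 0 | 5 |, each lane accumulates its share of
// the iteration space, and the final horizontal ADD in genVectorReducEnd
// recovers the correct scalar total. Using the neutral element (0 for add/xor,
// 1 for mul) in the remaining lanes keeps the embedded scalar undisturbed.
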
/// Generates final value for a vector reduction.
static Value genVectorReducEnd(CodeGen &codegen, OpBuilder &builder,
                               Location loc, VectorType vtp) {
  vector::CombiningKind kind = getCombiningKind(codegen.redKind);
  return builder.create<vector::ReductionOp>(loc, kind, codegen.redVal);
}

/// Updates scalarized reduction value.
static void updateReduc(Merger &merger, CodeGen &codegen, Value reduc) {
  assert(codegen.redKind != kNoReduc);
  codegen.redVal = merger.exp(codegen.redExp).val = reduc;
}

//===----------------------------------------------------------------------===//
// Sparse compiler synthesis methods (statements and expressions).
//===----------------------------------------------------------------------===//

/// Generates buffer for the output tensor. Note that all sparse kernels
/// assume that when all elements are written to (viz. x(i) = y(i) * z(i)),
/// the output buffer is already initialized to all zeroes and only nonzero
/// values are computed and written out. For updates (viz. x(i) += y(i) * z(i)),
/// only nonzero values are used for the updates and no assumption on the
/// original contents of the output buffer is necessary.
static Value genOutputBuffer(CodeGen &codegen, OpBuilder &builder,
                             linalg::GenericOp op, MemRefType denseTp,
                             ArrayRef<Value> args) {
  Location loc = op.getLoc();
  Value tensor = op.getOutputOperand(0)->get();
  // The output tensor simply could materialize from the buffer that will
  // be generated for the tensor present in the outs() clause. This has
  // the major advantage that the sparse kernel only updates the nonzero
  // positions for the output tensor.
  if (isInPlace(tensor))
    return builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
  // By default, a new buffer is allocated which is initialized to the
  // tensor defined in the outs() clause. This is always correct but
  // introduces a dense initialization component that may negatively
  // impact the running complexity of the sparse kernel. If the tensor
  // materializes into the computation, we need to preserve the zero
  // initialization assumption of all sparse output buffers.
  Value alloc = builder.create<memref::AllocOp>(loc, denseTp, args);
  if (isMaterializing(tensor)) {
    Value zero = constantZero(builder, loc, denseTp.getElementType());
    builder.create<linalg::FillOp>(loc, ValueRange{zero}, ValueRange{alloc});
  } else {
    Value init =
        builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
    builder.create<memref::CopyOp>(loc, init, alloc);
  }
  return alloc;
}

/// Local bufferization of all dense and sparse data structures.
/// This code enables testing the first prototype sparse compiler.
// TODO: replace this with a proliferated bufferization strategy
static void genBuffers(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                       linalg::GenericOp op) {
  Location loc = op.getLoc();
  assert(op.getNumInputsAndOutputs() == op.getNumInputs() + 1);
  // For every tensor, find lower and upper bound on dimensions, set the
  // same bounds on loop indices, and obtain dense or sparse buffer(s).
  SmallVector<Value, 4> args;
  for (OpOperand *t : op.getInputAndOutputOperands()) {
    unsigned tensor = t->getOperandNumber();
    auto shape = op.getShape(t);
    auto map = op.getTiedIndexingMap(t);
    auto enc = getSparseTensorEncoding(t->get().getType());
    // Scan all dimensions of current tensor.
    args.clear();
    for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
      AffineExpr a = map.getResult(perm(enc, d));
      if (a.getKind() != AffineExprKind::DimId)
        continue; // compound
      unsigned idx = a.cast<AffineDimExpr>().getPosition();
      // Handle sparse storage schemes.
      if (merger.isDim(tensor, idx, Dim::kSparse)) {
        auto dynShape = {ShapedType::kDynamicSize};
        auto ptrTp =
            MemRefType::get(dynShape, getPointerOverheadType(builder, enc));
        auto indTp =
            MemRefType::get(dynShape, getIndexOverheadType(builder, enc));
        Value dim = constantIndex(builder, loc, d);
        // Generate sparse primitives to obtain pointer and indices.
        codegen.pointers[tensor][idx] =
            builder.create<ToPointersOp>(loc, ptrTp, t->get(), dim);
        codegen.indices[tensor][idx] =
            builder.create<ToIndicesOp>(loc, indTp, t->get(), dim);
      }
      // Find upper bound in current dimension.
      unsigned p = perm(enc, d);
      Value up = linalg::createOrFoldDimOp(builder, loc, t->get(), p);
      if (ShapedType::isDynamic(shape[p]))
        args.push_back(up);
      assert(codegen.highs[tensor][idx] == nullptr);
      codegen.sizes[idx] = codegen.highs[tensor][idx] = up;
    }
    // Perform the required bufferization. Dense inputs materialize
    // from the input tensors. Dense outputs need special handling.
    // Sparse inputs use sparse primitives to obtain the values.
    // We also accept in-place all-dense annotated "sparse" outputs.
    Type elementType = getElementTypeOrSelf(t->get().getType());
    if (!enc) {
      // Non-annotated dense tensors.
      auto denseTp = MemRefType::get(shape, elementType);
      if (tensor < op.getNumInputs())
        codegen.buffers[tensor] =
            builder.create<bufferization::ToMemrefOp>(loc, denseTp, t->get());
      else
        codegen.buffers[tensor] =
            genOutputBuffer(codegen, builder, op, denseTp, args);
    } else if (t == codegen.sparseOut) {
      // True sparse output needs a lexIdx array.
      Value rank = constantIndex(builder, loc, op.getRank(t));
      auto dynShape = {ShapedType::kDynamicSize};
      auto memTp = MemRefType::get(dynShape, builder.getIndexType());
      codegen.lexIdx = builder.create<memref::AllocaOp>(loc, memTp, rank);
    } else {
      // Annotated sparse tensors.
      auto dynShape = {ShapedType::kDynamicSize};
      auto sparseTp = MemRefType::get(dynShape, elementType);
      codegen.buffers[tensor] =
          builder.create<ToValuesOp>(loc, sparseTp, t->get());
    }
  }
}

/// Constructs vector type.
static VectorType vectorType(CodeGen &codegen, Type etp) {
  unsigned numScalableDims = codegen.options.enableVLAVectorization;
  return VectorType::get(codegen.curVecLength, etp, numScalableDims);
}

/// Constructs vector type from pointer.
static VectorType vectorType(CodeGen &codegen, Value ptr) {
  return vectorType(codegen, ptr.getType().cast<MemRefType>().getElementType());
}

/// Constructs vector iteration mask.
static Value genVectorMask(CodeGen &codegen, OpBuilder &builder, Value iv,
                           Value lo, Value hi, Value step) {
  Location loc = iv.getLoc();
  VectorType mtp = vectorType(codegen, builder.getI1Type());
  // Special case if the vector length evenly divides the trip count (for
  // example, "for i = 0, 128, 16"). A constant all-true mask is generated
  // so that all subsequent masked memory operations are immediately folded
  // into unconditional memory operations.
  IntegerAttr loInt, hiInt, stepInt;
  if (matchPattern(lo, m_Constant(&loInt)) &&
      matchPattern(hi, m_Constant(&hiInt)) &&
      matchPattern(step, m_Constant(&stepInt))) {
    if (((hiInt.getInt() - loInt.getInt()) % stepInt.getInt()) == 0)
      return builder.create<vector::BroadcastOp>(
          loc, mtp, constantI1(builder, loc, true));
  }
  // Otherwise, generate a vector mask that avoids overrunning the upper bound
  // during vector execution. Here we rely on subsequent loop optimizations to
  // avoid executing the mask in all iterations, for example, by splitting the
  // loop into an unconditional vector loop and a scalar cleanup loop.
  auto minMap = AffineMap::get(
      /*dimCount=*/2, /*symbolCount=*/1,
      {builder.getAffineSymbolExpr(0),
       builder.getAffineDimExpr(0) - builder.getAffineDimExpr(1)},
      builder.getContext());
  Value end =
      builder.createOrFold<AffineMinOp>(loc, minMap, ValueRange{hi, iv, step});
  return builder.create<vector::CreateMaskOp>(loc, mtp, end);
}

/// Generates a vectorized load lhs = a[ind[lo:hi]] or lhs = a[lo:hi].
static Value genVectorLoad(CodeGen &codegen, OpBuilder &builder, Value ptr,
                           ArrayRef<Value> args) {
  Location loc = ptr.getLoc();
  VectorType vtp = vectorType(codegen, ptr);
  Value pass = constantZero(builder, loc, vtp);
  if (args.back().getType().isa<VectorType>()) {
    SmallVector<Value, 4> scalarArgs(args.begin(), args.end());
    Value indexVec = args.back();
    scalarArgs.back() = constantIndex(builder, loc, 0);
    return builder.create<vector::GatherOp>(loc, vtp, ptr, scalarArgs, indexVec,
                                            codegen.curVecMask, pass);
  }
  return builder.create<vector::MaskedLoadOp>(loc, vtp, ptr, args,
                                              codegen.curVecMask, pass);
}

/// Generates a vectorized store a[ind[lo:hi]] = rhs or a[lo:hi] = rhs.
static void genVectorStore(CodeGen &codegen, OpBuilder &builder, Value rhs,
                           Value ptr, ArrayRef<Value> args) {
  Location loc = ptr.getLoc();
  if (args.back().getType().isa<VectorType>()) {
    SmallVector<Value, 4> scalarArgs(args.begin(), args.end());
    Value indexVec = args.back();
    scalarArgs.back() = constantIndex(builder, loc, 0);
    builder.create<vector::ScatterOp>(loc, ptr, scalarArgs, indexVec,
                                      codegen.curVecMask, rhs);
    return;
  }
  builder.create<vector::MaskedStoreOp>(loc, ptr, args, codegen.curVecMask,
                                        rhs);
}

/// Generates a vectorized invariant. Here we rely on subsequent loop
/// optimizations to hoist the invariant broadcast out of the vector loop.
static Value genVectorInvariantValue(CodeGen &codegen, OpBuilder &builder,
                                     Value val) {
  VectorType vtp = vectorType(codegen, val.getType());
  return builder.create<vector::BroadcastOp>(val.getLoc(), vtp, val);
}

/// Generates an affine expression.
//
// TODO: generalize for sparse tensor subscripts
//
static Value genAffine(CodeGen &codegen, OpBuilder &builder, AffineExpr a,
                       Location loc) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    unsigned idx = a.cast<AffineDimExpr>().getPosition();
    return codegen.loops[idx]; // universal dense index
  }
  case AffineExprKind::Add: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return builder.create<arith::AddIOp>(
        loc, genAffine(codegen, builder, binOp.getLHS(), loc),
        genAffine(codegen, builder, binOp.getRHS(), loc));
  }
  case AffineExprKind::Mul: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return builder.create<arith::MulIOp>(
        loc, genAffine(codegen, builder, binOp.getLHS(), loc),
        genAffine(codegen, builder, binOp.getRHS(), loc));
  }
  case AffineExprKind::Constant: {
    int64_t c = a.cast<AffineConstantExpr>().getValue();
    return constantIndex(builder, loc, c);
  }
  default:
    llvm_unreachable("unexpected affine subscript");
  }
}

/// Generates index for load/store on sparse tensor.
static Value genIndex(CodeGen &codegen, linalg::GenericOp op, OpOperand *t) {
  auto map = op.getTiedIndexingMap(t);
  auto enc = getSparseTensorEncoding(t->get().getType());
  AffineExpr a = map.getResult(perm(enc, map.getNumResults() - 1));
  assert(a.getKind() == AffineExprKind::DimId);
  unsigned idx = a.cast<AffineDimExpr>().getPosition();
  return codegen.loops[idx];
}

/// Generates subscript for load/store on a dense or sparse tensor.
static Value genSubscript(CodeGen &codegen, OpBuilder &builder,
                          linalg::GenericOp op, OpOperand *t,
                          SmallVector<Value, 4> &args) {
  unsigned tensor = t->getOperandNumber();
  auto map = op.getTiedIndexingMap(t);
  auto enc = getSparseTensorEncoding(t->get().getType());
  unsigned rank = map.getNumResults();
  if (enc) {
    // Note that currently, all sparse subscripts are simple.
    // TODO: accept affine too?
    AffineExpr a = map.getResult(perm(enc, rank - 1));
    assert(a.getKind() == AffineExprKind::DimId);
    unsigned idx = a.cast<AffineDimExpr>().getPosition();
    assert(codegen.pidxs[tensor][idx] != nullptr);
    args.push_back(codegen.pidxs[tensor][idx]); // position index
  } else {
    for (unsigned d = 0; d < rank; d++) {
      AffineExpr a = map.getResult(perm(enc, d));
      args.push_back(genAffine(codegen, builder, a, op.getLoc()));
    }
  }
  return codegen.buffers[tensor];
}

/// Generates insertion code to implement dynamic tensor load.
static Value genInsertionLoad(CodeGen &codegen, OpBuilder &builder,
                              linalg::GenericOp op, OpOperand *t) {
  Location loc = op.getLoc();
  // Direct lexicographic index order, tensor loads as zero.
  if (!codegen.expValues) {
    Type tp = getElementTypeOrSelf(t->get().getType());
    return constantZero(builder, loc, tp);
  }
  // Load from expanded access pattern.
  Value index = genIndex(codegen, op, t);
  return builder.create<memref::LoadOp>(loc, codegen.expValues, index);
}

/// Generates insertion code to implement dynamic tensor store.
static void genInsertionStore(CodeGen &codegen, OpBuilder &builder,
                              linalg::GenericOp op, OpOperand *t, Value rhs) {
  Location loc = op.getLoc();
  // Direct insertion in lexicographic index order.
  if (!codegen.expValues) {
    builder.create<LexInsertOp>(loc, t->get(), codegen.lexIdx, rhs);
    return;
  }
  // Generates insertion code along expanded access pattern.
  //   if (!expFilled[i]) then
  //     expFilled[i] = true
  //     expAdded[inserts++] = i
  //   endif
  //   values[i] = rhs
  Value index = genIndex(codegen, op, t);
  Value fval = constantI1(builder, loc, false);
  Value tval = constantI1(builder, loc, true);
  // If statement.
  Value filled = builder.create<memref::LoadOp>(loc, codegen.expFilled, index);
  Value cond = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                             filled, fval);
  scf::IfOp ifOp = builder.create<scf::IfOp>(loc, builder.getIndexType(), cond,
                                             /*else=*/true);
  // True branch.
  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
  builder.create<memref::StoreOp>(loc, tval, codegen.expFilled, index);
  builder.create<memref::StoreOp>(loc, index, codegen.expAdded,
                                  codegen.expCount);
  Value one = constantIndex(builder, loc, 1);
  Value add = builder.create<arith::AddIOp>(loc, codegen.expCount, one);
  builder.create<scf::YieldOp>(loc, add);
  // False branch.
  builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
  builder.create<scf::YieldOp>(loc, codegen.expCount);
  builder.setInsertionPointAfter(ifOp);
  // Value assignment.
  codegen.expCount = ifOp.getResult(0);
  builder.create<memref::StoreOp>(loc, rhs, codegen.expValues, index);
}

/// Generates a load on a dense or sparse tensor.
static Value genTensorLoad(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                           linalg::GenericOp op, unsigned exp) {
  // Test if the load was hoisted to a higher loop nest.
  Value val = merger.exp(exp).val;
  if (val) {
    if (codegen.curVecLength > 1 && !val.getType().isa<VectorType>())
      return genVectorInvariantValue(codegen, builder, val);
    return val;
  }
  // Load during insertion.
  OpOperand *t = op.getInputAndOutputOperands()[merger.exp(exp).tensor];
  if (t == codegen.sparseOut)
    return genInsertionLoad(codegen, builder, op, t);
  // Actual load.
  SmallVector<Value, 4> args;
  Value ptr = genSubscript(codegen, builder, op, t, args);
  if (codegen.curVecLength > 1)
    return genVectorLoad(codegen, builder, ptr, args);
  return builder.create<memref::LoadOp>(op.getLoc(), ptr, args);
}

/// Generates a store on a dense or sparse tensor.
static void genTensorStore(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                           linalg::GenericOp op, unsigned exp, Value rhs) {
  Location loc = op.getLoc();
  // Test if this is a scalarized reduction.
  if (codegen.redVal) {
    if (codegen.curVecLength > 1)
      rhs = builder.create<arith::SelectOp>(loc, codegen.curVecMask, rhs,
                                            codegen.redVal);
    updateReduc(merger, codegen, rhs);
    return;
  }
  // Store during insertion.
  OpOperand *t = op.getOutputOperand(0);
  if (t == codegen.sparseOut) {
    if (!rhs) {
      // Only unary and binary are allowed to return uninitialized rhs
      // to indicate missing output.
      assert(merger.exp(exp).kind == kUnary || merger.exp(exp).kind == kBinary);
    } else {
      genInsertionStore(codegen, builder, op, t, rhs);
    }
    return;
  }
  // Actual store.
  SmallVector<Value, 4> args;
  Value ptr = genSubscript(codegen, builder, op, t, args);
  if (codegen.curVecLength > 1)
    genVectorStore(codegen, builder, rhs, ptr, args);
  else
    builder.create<memref::StoreOp>(loc, rhs, ptr, args);
}

/// Generates a pointer/index load from the sparse storage scheme. Narrower
/// data types need to be zero extended before casting the value into the
/// index type used for looping and indexing.
static Value genLoad(CodeGen &codegen, OpBuilder &builder, Location loc,
                     Value ptr, Value s) {
  // See https://llvm.org/docs/GetElementPtr.html for some background on
  // the complications described below.
  if (codegen.curVecLength > 1) {
    // Since the index vector is used in subsequent gather/scatter operations,
    // which effectively define an unsigned pointer + signed index, we must
    // zero extend the vector to an index width. For 8-bit and 16-bit values,
    // a 32-bit index width suffices. For 32-bit values, zero extending the
    // elements into 64-bit loses some performance since the 32-bit indexed
    // gather/scatter is more efficient than the 64-bit index variant (if the
    // negative 32-bit index space is unused, the enableSIMDIndex32 flag can
    // preserve this performance). For 64-bit values, there is no good way
    // to state that the indices are unsigned, which creates the potential of
    // incorrect address calculations in the unlikely case we need such
    // extremely large offsets.
    Type etp = ptr.getType().cast<MemRefType>().getElementType();
    Value vload = genVectorLoad(codegen, builder, ptr, {s});
    if (!etp.isa<IndexType>()) {
      if (etp.getIntOrFloatBitWidth() < 32)
        vload = builder.create<arith::ExtUIOp>(
            loc, vectorType(codegen, builder.getI32Type()), vload);
      else if (etp.getIntOrFloatBitWidth() < 64 &&
               !codegen.options.enableSIMDIndex32)
        vload = builder.create<arith::ExtUIOp>(
            loc, vectorType(codegen, builder.getI64Type()), vload);
    }
    return vload;
  }
  // For the scalar case, we simply zero extend narrower indices into 64-bit
  // values before casting to index without a performance penalty. Here too,
  // however, indices that already are 64-bit, in theory, cannot express the
  // full range as explained above.
  Value load = builder.create<memref::LoadOp>(loc, ptr, s);
  if (!load.getType().isa<IndexType>()) {
    if (load.getType().getIntOrFloatBitWidth() < 64)
      load = builder.create<arith::ExtUIOp>(loc, builder.getI64Type(), load);
    load =
        builder.create<arith::IndexCastOp>(loc, builder.getIndexType(), load);
  }
  return load;
}

/// Generates an invariant value.
static Value genInvariantValue(Merger &merger, CodeGen &codegen,
                               OpBuilder &builder, unsigned exp) {
  Value val = merger.exp(exp).val;
  if (codegen.curVecLength > 1)
    return genVectorInvariantValue(codegen, builder, val);
  return val;
}

/// Generates an address computation "sz * p + i".
static Value genAddress(CodeGen &codegen, OpBuilder &builder, Location loc,
                        Value size, Value p, Value i) {
  Value mul = builder.create<arith::MulIOp>(loc, size, p);
  if (auto vtp = i.getType().dyn_cast<VectorType>()) {
    Value inv =
        builder.create<arith::IndexCastOp>(loc, vtp.getElementType(), mul);
    mul = genVectorInvariantValue(codegen, builder, inv);
  }
  return builder.create<arith::AddIOp>(loc, mul, i);
}

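// A small illustration of the linearized addressing above (the shapes are
// hypothetical): for an all-dense annotated 2-d tensor of size 100 x 50
// stored as a 1-d values array, the element at (p, i) lives at offset
// 50 * p + i, which is exactly the "sz * p + i" computation emitted here,
// with a broadcast of the scalar part when i is already a vector.
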
/// Generates an index value.
static Value genIndexValue(CodeGen &codegen, OpBuilder &builder, unsigned idx,
                           unsigned ldx) {
  Value ival = codegen.loops[idx];
  Type itype = ival.getType();
  // During vectorization, we either encounter:
  // (1) indices already in vector form, as in ... = ind[lo:hi], good to go, or
  // (2) single index, as in ... = i, must convert to [i, i+1, ...] for inner i.
  unsigned vl = codegen.curVecLength;
  if (vl > 1 && !itype.isa<VectorType>()) {
    Location loc = ival.getLoc();
    VectorType vtp = vectorType(codegen, itype);
    ival = builder.create<vector::BroadcastOp>(loc, vtp, ival);
    if (idx == ldx) {
      Value incr;
      if (vtp.isScalable()) {
        Type stepvty = vectorType(codegen, builder.getI64Type());
        Value stepv = builder.create<LLVM::StepVectorOp>(loc, stepvty);
        incr = builder.create<arith::IndexCastOp>(loc, vtp, stepv);
      } else {
        SmallVector<APInt, 4> integers;
        for (unsigned i = 0; i < vl; i++)
          integers.push_back(APInt(/*width=*/64, i));
        auto values = DenseElementsAttr::get(vtp, integers);
        incr = builder.create<arith::ConstantOp>(loc, vtp, values);
      }
      ival = builder.create<arith::AddIOp>(loc, ival, incr);
    }
  }
  return ival;
}

/// Semi-ring branches are simply inlined by the sparse compiler. Prior
/// analysis has verified that all computations are "local" to the inlined
/// branch or otherwise invariantly defined outside the loop nest, with the
/// exception of index computations, which need to be relinked to actual
/// inlined cloned code.
static Value relinkBranch(CodeGen &codegen, RewriterBase &rewriter,
                          Block *block, Value e, unsigned ldx) {
  if (Operation *def = e.getDefiningOp()) {
    if (auto indexOp = dyn_cast<linalg::IndexOp>(def))
      return genIndexValue(codegen, rewriter, indexOp.dim(), ldx);
    if (def->getBlock() == block) {
      for (unsigned i = 0, n = def->getNumOperands(); i < n; i++)
        def->setOperand(
            i, relinkBranch(codegen, rewriter, block, def->getOperand(i), ldx));
    }
  }
  return e;
}

/// Recursively generates tensor expression.
static Value genExp(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                    linalg::GenericOp op, unsigned exp, unsigned ldx) {
  Location loc = op.getLoc();
  if (exp == -1u)
    return Value();
  if (merger.exp(exp).kind == Kind::kTensor)
    return genTensorLoad(merger, codegen, rewriter, op, exp);
  if (merger.exp(exp).kind == Kind::kInvariant)
    return genInvariantValue(merger, codegen, rewriter, exp);
  if (merger.exp(exp).kind == Kind::kIndex)
    return genIndexValue(codegen, rewriter, merger.exp(exp).index, ldx);
  Value v0 =
      genExp(merger, codegen, rewriter, op, merger.exp(exp).children.e0, ldx);
  Value v1 =
      genExp(merger, codegen, rewriter, op, merger.exp(exp).children.e1, ldx);
  Value ee = merger.buildExp(rewriter, loc, exp, v0, v1);
  if (ee && (merger.exp(exp).kind == Kind::kUnary ||
             merger.exp(exp).kind == Kind::kBinary ||
             merger.exp(exp).kind == Kind::kBinaryBranch))
    ee = relinkBranch(codegen, rewriter, ee.getParentBlock(), ee, ldx);
  return ee;
}

/// Determines if affine expression is invariant.
static bool isInvariantAffine(const CodeGen &codegen, AffineExpr a,
                              unsigned ldx, bool &atLevel) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    unsigned idx = a.cast<AffineDimExpr>().getPosition();
    if (idx == ldx)
      atLevel = true;
    return codegen.loops[idx] != nullptr; // no longer in play?
  }
  case AffineExprKind::Add:
  case AffineExprKind::Mul: {
    auto binOp = a.cast<AffineBinaryOpExpr>();
    return isInvariantAffine(codegen, binOp.getLHS(), ldx, atLevel) &&
           isInvariantAffine(codegen, binOp.getRHS(), ldx, atLevel);
  }
  default:
    return true;
  }
}

/// Hoists loop invariant tensor loads for which indices have been exhausted.
static void genInvariants(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                          linalg::GenericOp op, unsigned exp, unsigned ldx,
                          bool atStart, Kind last = Kind::kTensor) {
  if (exp == -1u)
    return;
  if (merger.exp(exp).kind == Kind::kTensor) {
    // Inspect tensor indices.
    bool atLevel = ldx == -1u;
    OpOperand *t = op.getInputAndOutputOperands()[merger.exp(exp).tensor];
    auto map = op.getTiedIndexingMap(t);
    auto enc = getSparseTensorEncoding(t->get().getType());
    for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
      AffineExpr a = map.getResult(perm(enc, d));
      if (!isInvariantAffine(codegen, a, ldx, atLevel))
        return; // still in play
    }
    // All exhausted at this level (atLevel denotes exactly at this level).
    if (!atLevel)
      return;
    OpOperand *lhs = op.getOutputOperand(0);
    if (lhs == t) {
      // Start or end a scalarized reduction
      if (atStart) {
        Value load = genTensorLoad(merger, codegen, builder, op, exp);
        codegen.redKind = getReduction(last);
        codegen.redExp = exp;
        updateReduc(merger, codegen, load);
      } else {
        Value redVal = codegen.redVal;
        updateReduc(merger, codegen, Value());
        codegen.redExp = -1u;
        codegen.redKind = kNoReduc;
        genTensorStore(merger, codegen, builder, op, exp, redVal);
      }
    } else {
      // Start or end loop invariant hoisting of a tensor load.
      merger.exp(exp).val =
          atStart ? genTensorLoad(merger, codegen, builder, op, exp) : Value();
    }
  } else if (merger.exp(exp).kind != Kind::kInvariant &&
             merger.exp(exp).kind != Kind::kIndex) {
    // Traverse into the binary operations. Note that we only hoist
    // tensor loads, since subsequent MLIR/LLVM passes know how to
    // deal with all other kinds of derived loop invariants.
    Kind last = merger.exp(exp).kind;
    unsigned e0 = merger.exp(exp).children.e0;
    unsigned e1 = merger.exp(exp).children.e1;
    genInvariants(merger, codegen, builder, op, e0, ldx, atStart, last);
    genInvariants(merger, codegen, builder, op, e1, ldx, atStart, last);
  }
}

/// Generates an expanded access pattern in innermost dimension.
static void genExpansion(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                         linalg::GenericOp op, unsigned at, bool atStart) {
  OpOperand *lhs = codegen.sparseOut;
  if (!lhs || codegen.outerParNest != op.getRank(lhs) - 1 ||
      at != codegen.outerParNest)
    return; // not needed at this level
  // Generate start or end of an expanded access pattern.
  Value tensor = lhs->get();
  Location loc = op.getLoc();
  if (atStart) {
    auto dynShape = {ShapedType::kDynamicSize};
    Type etp = tensor.getType().cast<ShapedType>().getElementType();
    Type t1 = MemRefType::get(dynShape, etp);
    Type t2 = MemRefType::get(dynShape, builder.getI1Type());
    Type t3 = MemRefType::get(dynShape, builder.getIndexType());
    Type t4 = builder.getIndexType();
    auto res =
        builder.create<ExpandOp>(loc, TypeRange({t1, t2, t3, t4}), tensor);
    assert(res.getNumResults() == 4);
    assert(!codegen.expValues);
    codegen.expValues = res.getResult(0);
    codegen.expFilled = res.getResult(1);
    codegen.expAdded = res.getResult(2);
    codegen.expCount = res.getResult(3);
  } else {
    assert(codegen.expValues);
    builder.create<CompressOp>(loc, tensor, codegen.lexIdx, codegen.expValues,
                               codegen.expFilled, codegen.expAdded,
                               codegen.expCount);
    codegen.expValues = codegen.expFilled = codegen.expAdded =
        codegen.expCount = Value();
  }
}

/// Generates initialization code for the subsequent loop sequence at
/// current index level. Returns true if the loop sequence needs to
/// maintain the universal index.
static bool genInit(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                    linalg::GenericOp op, std::vector<unsigned> &topSort,
                    unsigned at, BitVector &inits) {
  bool needsUniv = false;
  Location loc = op.getLoc();
  unsigned idx = topSort[at];

  // Initialize sparse positions.
  for (unsigned b = 0, be = inits.size(); b < be; b++) {
    if (inits[b]) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      if (merger.isDim(b, Dim::kSparse)) {
        // Initialize sparse index.
        unsigned pat = at;
        for (; pat != 0; pat--) {
          if (codegen.pidxs[tensor][topSort[pat - 1]])
            break;
        }
        Value ptr = codegen.pointers[tensor][idx];
        Value one = constantIndex(builder, loc, 1);
        Value p0 = (pat == 0) ? constantIndex(builder, loc, 0)
                              : codegen.pidxs[tensor][topSort[pat - 1]];
        codegen.pidxs[tensor][idx] = genLoad(codegen, builder, loc, ptr, p0);
        Value p1 = builder.create<arith::AddIOp>(loc, p0, one);
        codegen.highs[tensor][idx] = genLoad(codegen, builder, loc, ptr, p1);
      } else {
        // Dense index still in play.
        needsUniv = true;
      }
    }
  }

  // Initialize the universal dense index.
  codegen.loops[idx] = constantIndex(builder, loc, 0);
  return needsUniv;
}

/// Returns vectorization strategy. Any implicit inner loop in the Linalg
/// operation is a candidate. Whether it is actually converted to SIMD code
/// depends on the requested strategy.
static bool isVectorFor(CodeGen &codegen, bool isInner, bool isReduction,
                        bool isSparse) {
  // Reject vectorization of sparse output, unless innermost is reduction.
  if (codegen.sparseOut && !isReduction)
    return false;
  // Inspect strategy.
  switch (codegen.options.vectorizationStrategy) {
  case SparseVectorizationStrategy::kNone:
    return false;
  case SparseVectorizationStrategy::kDenseInnerLoop:
    return isInner && !isSparse;
  case SparseVectorizationStrategy::kAnyStorageInnerLoop:
    return isInner;
  }
  llvm_unreachable("unexpected vectorization strategy");
}

/// Returns parallelization strategy. Any implicit loop in the Linalg operation
/// that is marked "parallel" is a candidate. Whether it is actually converted
/// to a parallel operation depends on the requested strategy.
static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,
                          bool isSparse, bool isVector) {
  // Reject parallelization of sparse output.
  if (codegen.sparseOut)
    return false;
  // Inspect strategy.
  switch (codegen.options.parallelizationStrategy) {
  case SparseParallelizationStrategy::kNone:
    return false;
  case SparseParallelizationStrategy::kDenseOuterLoop:
    return isOuter && !isSparse && !isReduction && !isVector;
  case SparseParallelizationStrategy::kAnyStorageOuterLoop:
    return isOuter && !isReduction && !isVector;
  case SparseParallelizationStrategy::kDenseAnyLoop:
    return !isSparse && !isReduction && !isVector;
  case SparseParallelizationStrategy::kAnyStorageAnyLoop:
    return !isReduction && !isVector;
  }
  llvm_unreachable("unexpected parallelization strategy");
}

/// Checks unit stride for dense tensors. The iteration graph may have ignored
/// dense access patterns in order to avoid cycles (sparse access patterns are
/// always placed innermost), but that means dense access has become strided.
/// This prevents effective vectorization.
static bool denseUnitStrides(Merger &merger, linalg::GenericOp op,
                             unsigned idx) {
  for (OpOperand *t : op.getInputAndOutputOperands()) {
    if (!getSparseTensorEncoding(t->get().getType())) {
      auto map = op.getTiedIndexingMap(t);
      for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
        AffineExpr a = map.getResult(d);
        // Report non-unit stride if innermost index appears at an outer
        // dimension (true non-unit stride) or if the innermost index appears
        // in a compound subscript in the innermost dimension. Even if the
        // latter is unit stride, it does not play well with scatter/gather.
        // TODO: accept unit stride affine innermost like a[i,j+k+1]?
        if (a.isFunctionOfDim(idx) &&
            ((d != rank - 1) || (a.getKind() != AffineExprKind::DimId)))
          return false;
      }
    }
  }
  return true;
}

/// Generates a for-loop on a single index.
static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                         linalg::GenericOp op, bool isOuter, bool isInner,
                         unsigned idx, BitVector &indices) {
  unsigned fb = indices.find_first();
  unsigned tensor = merger.tensor(fb);
  assert(idx == merger.index(fb));
  auto iteratorTypes = op.iterator_types().getValue();
  bool isReduction = isReductionIterator(iteratorTypes[idx]);
  bool isSparse = merger.isDim(fb, Dim::kSparse);
  bool isVector = isVectorFor(codegen, isInner, isReduction, isSparse) &&
                  denseUnitStrides(merger, op, idx);
  bool isParallel =
      isParallelFor(codegen, isOuter, isReduction, isSparse, isVector);

  // Prepare vector length.
  if (isVector)
    codegen.curVecLength = codegen.options.vectorLength;

  // Loop bounds and increment.
  Location loc = op.getLoc();
  Value lo = isSparse ? codegen.pidxs[tensor][idx] : codegen.loops[idx];
  Value hi = isSparse ? codegen.highs[tensor][idx] : codegen.sizes[idx];
  Value step = constantIndex(builder, loc, codegen.curVecLength);
  if (isVector && codegen.options.enableVLAVectorization) {
    Value vscale = builder.create<vector::VectorScaleOp>(
        loc, IndexType::get(builder.getContext()));
    step = builder.create<arith::MulIOp>(loc, vscale, step);
  }

  // Emit a parallel loop.
  if (isParallel) {
    assert(!isVector);
    scf::ParallelOp parOp = builder.create<scf::ParallelOp>(loc, lo, hi, step);
    if (isSparse)
      codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0];
    else
      codegen.loops[idx] = parOp.getInductionVars()[0];
    builder.setInsertionPointToStart(parOp.getBody());
    return parOp;
  }

  // Emit a sequential or vector loop.
  SmallVector<Value, 4> operands;
  if (codegen.redVal) {
    // In a vector loop, bring reduction into SIMD form, if not already.
    if (isVector && !codegen.redVal.getType().isa<VectorType>()) {
      VectorType vtp = vectorType(codegen, codegen.redVal.getType());
      Value vred = genVectorReducInit(codegen, builder, loc, vtp);
      updateReduc(merger, codegen, vred);
    }
    operands.push_back(codegen.redVal);
  }
  if (codegen.expValues)
    operands.push_back(codegen.expCount);
  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, operands);
  if (codegen.redVal)
    updateReduc(merger, codegen, forOp.getRegionIterArgs().front());
  if (codegen.expValues)
    codegen.expCount = forOp.getRegionIterArgs().back();
  // Assign induction variable to sparse or dense index.
  Value iv = forOp.getInductionVar();
  if (isSparse)
    codegen.pidxs[tensor][idx] = iv;
  else
    codegen.loops[idx] = iv;
  builder.setInsertionPointToStart(forOp.getBody());
  // Share vector iteration mask between all subsequent loads/stores.
  if (isVector)
    codegen.curVecMask = genVectorMask(codegen, builder, iv, lo, hi, step);
  return forOp;
}

/// Emits a while-loop for co-iteration over multiple indices.
static Operation *genWhile(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                           linalg::GenericOp op, unsigned idx, bool needsUniv,
                           BitVector &indices) {
  SmallVector<Type, 4> types;
  SmallVector<Value, 4> operands;
  // Construct the while-loop with a parameter for each index.
  Type indexType = builder.getIndexType();
  for (unsigned b = 0, be = indices.size(); b < be; b++) {
    if (indices[b] && merger.isDim(b, Dim::kSparse)) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      types.push_back(indexType);
      operands.push_back(codegen.pidxs[tensor][idx]);
    }
  }
  if (codegen.redVal) {
    types.push_back(codegen.redVal.getType());
    operands.push_back(codegen.redVal);
  }
  if (codegen.expValues) {
    types.push_back(indexType);
    operands.push_back(codegen.expCount);
  }
  if (needsUniv) {
    types.push_back(indexType);
    operands.push_back(codegen.loops[idx]);
  }
  assert(types.size() == operands.size());
  Location loc = op.getLoc();
  scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);

  SmallVector<Location> locs(types.size(), loc);
  Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
  Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);

  // Build the "before" region, which effectively consists
  // of a conjunction of "i < upper" tests on all induction variables.
  builder.setInsertionPointToStart(&whileOp.getBefore().front());
  Value cond;
  unsigned o = 0;
  for (unsigned b = 0, be = indices.size(); b < be; b++) {
    if (indices[b] && merger.isDim(b, Dim::kSparse)) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      Value op1 = before->getArgument(o);
      Value op2 = codegen.highs[tensor][idx];
      Value opc = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
                                                op1, op2);
      cond = cond ? builder.create<arith::AndIOp>(loc, cond, opc) : opc;
      codegen.pidxs[tensor][idx] = after->getArgument(o++);
    }
  }
  if (codegen.redVal)
    updateReduc(merger, codegen, after->getArgument(o++));
  if (codegen.expValues)
    codegen.expCount = after->getArgument(o++);
  if (needsUniv)
    codegen.loops[idx] = after->getArgument(o++);
  assert(o == operands.size());
  builder.create<scf::ConditionOp>(loc, cond, before->getArguments());
  builder.setInsertionPointToStart(&whileOp.getAfter().front());
  return whileOp;
}

/// Generates a for-loop or a while-loop, depending on whether it implements
/// singleton iteration or co-iteration over the given conjunction.
static Operation *genLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                          linalg::GenericOp op, std::vector<unsigned> &topSort,
                          unsigned at, bool needsUniv, BitVector &indices) {
  unsigned idx = topSort[at];
  if (indices.count() == 1) {
    bool isOuter = at == 0;
    bool isInner = at == topSort.size() - 1;
    return genFor(merger, codegen, builder, op, isOuter, isInner, idx, indices);
  }
  return genWhile(merger, codegen, builder, op, idx, needsUniv, indices);
}

/// Generates the local variables for this loop, consisting of the sparse
/// indices, restored universal dense index, and dense positions.
static void genLocals(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                      linalg::GenericOp op, std::vector<unsigned> &topSort,
                      unsigned at, bool needsUniv, BitVector &locals) {
  Location loc = op.getLoc();
  unsigned idx = topSort[at];

  // Initialize sparse indices.
  Value min;
  for (unsigned b = 0, be = locals.size(); b < be; b++) {
    if (locals[b] && merger.isDim(b, Dim::kSparse)) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      Value ptr = codegen.indices[tensor][idx];
      Value s = codegen.pidxs[tensor][idx];
      Value load = genLoad(codegen, builder, loc, ptr, s);
      codegen.idxs[tensor][idx] = load;
      if (!needsUniv) {
        if (min) {
          Value cmp = builder.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::ult, load, min);
          min = builder.create<arith::SelectOp>(loc, cmp, load, min);
        } else {
          min = load;
        }
      }
    }
  }

  // Merge dense universal index over minimum.
  if (min) {
    assert(!needsUniv);
    codegen.loops[idx] = min;
  }

  // Initialize dense positions. Note that we generate dense indices of the
  // output tensor unconditionally, since they may not appear in the lattice,
  // but may be needed for linearized codegen.
  for (unsigned b = 0, be = locals.size(); b < be; b++) {
    if ((locals[b] || merger.isOutTensor(b, idx)) &&
        merger.isDim(b, Dim::kDense)) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      unsigned pat = at;
      for (; pat != 0; pat--)
        if (codegen.pidxs[tensor][topSort[pat - 1]])
          break;
      Value p = (pat == 0) ? constantIndex(builder, loc, 0)
                           : codegen.pidxs[tensor][topSort[pat - 1]];
      codegen.pidxs[tensor][idx] = genAddress(
          codegen, builder, loc, codegen.sizes[idx], p, codegen.loops[idx]);
    }
  }

  // Move the insertion indices in lexicographic index order. During access
  // pattern expansion, we can skip setting the innermost dimension.
  if (codegen.sparseOut && !codegen.expValues) {
    Value pos = constantIndex(builder, loc, at);
    builder.create<memref::StoreOp>(loc, codegen.loops[idx], codegen.lexIdx,
                                    pos);
  }
}

/// Generates the induction structure for a while-loop.
static void genWhileInduction(Merger &merger, CodeGen &codegen,
                              OpBuilder &builder, linalg::GenericOp op,
                              unsigned idx, bool needsUniv,
                              BitVector &induction, scf::WhileOp whileOp) {
  Location loc = op.getLoc();
  // Finalize each else branch of all if statements.
  if (codegen.redVal || codegen.expValues) {
    while (auto ifOp = dyn_cast_or_null<scf::IfOp>(
               builder.getInsertionBlock()->getParentOp())) {
      unsigned y = 0;
      SmallVector<Value, 4> yields;
      if (codegen.redVal) {
        yields.push_back(codegen.redVal);
        updateReduc(merger, codegen, ifOp.getResult(y++));
      }
      if (codegen.expValues) {
        yields.push_back(codegen.expCount);
        codegen.expCount = ifOp->getResult(y++);
      }
      assert(y == yields.size());
      builder.create<scf::YieldOp>(loc, yields);
      builder.setInsertionPointAfter(ifOp);
    }
  }
  builder.setInsertionPointToEnd(&whileOp.getAfter().front());
  // Finalize the induction. Note that the induction could be performed
  // in the individual if-branches to avoid re-evaluating the conditions.
  // However, that would result in a rather elaborate forest of yield
  // instructions during code generation. Moreover, performing the induction
  // after the if-statements more closely resembles code generated by TACO.
  unsigned o = 0;
  SmallVector<Value, 4> operands;
  Value one = constantIndex(builder, loc, 1);
  for (unsigned b = 0, be = induction.size(); b < be; b++) {
    if (induction[b] && merger.isDim(b, Dim::kSparse)) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      Value op1 = codegen.idxs[tensor][idx];
      Value op2 = codegen.loops[idx];
      Value op3 = codegen.pidxs[tensor][idx];
      Value cmp = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                                op1, op2);
      Value add = builder.create<arith::AddIOp>(loc, op3, one);
      operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, op3));
      codegen.pidxs[tensor][idx] = whileOp->getResult(o++);
    }
  }
  if (codegen.redVal) {
    operands.push_back(codegen.redVal);
    updateReduc(merger, codegen, whileOp->getResult(o++));
  }
  if (codegen.expValues) {
    operands.push_back(codegen.expCount);
    codegen.expCount = whileOp->getResult(o++);
  }
  if (needsUniv) {
    operands.push_back(
        builder.create<arith::AddIOp>(loc, codegen.loops[idx], one));
    codegen.loops[idx] = whileOp->getResult(o++);
  }
  assert(o == operands.size());
  builder.create<scf::YieldOp>(loc, operands);
  builder.setInsertionPointAfter(whileOp);
}

/// Generates the induction structure for a for-loop.
static void genForInduction(Merger &merger, CodeGen &codegen,
                            OpBuilder &builder, linalg::GenericOp op,
                            Operation *loop) {
  Location loc = op.getLoc();
  unsigned o = 0;
  SmallVector<Value, 4> operands;
  if (codegen.redVal) {
    operands.push_back(codegen.redVal);
    updateReduc(merger, codegen, loop->getResult(o++));
  }
  if (codegen.expValues) {
    operands.push_back(codegen.expCount);
    codegen.expCount = loop->getResult(o++);
  }
  assert(o == operands.size());
  if (o > 0)
    builder.create<scf::YieldOp>(loc, operands);
  builder.setInsertionPointAfter(loop);
}

/// Generates a single if-statement within a while-loop.
static scf::IfOp genIf(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                       linalg::GenericOp op, unsigned idx,
                       BitVector &conditions) {
  Location loc = op.getLoc();
  SmallVector<Type, 4> types;
  Value cond;
  for (unsigned b = 0, be = conditions.size(); b < be; b++) {
    if (conditions[b]) {
      unsigned tensor = merger.tensor(b);
      assert(idx == merger.index(b));
      Value clause;
      if (merger.isDim(b, Dim::kSparse)) {
        Value op1 = codegen.idxs[tensor][idx];
        Value op2 = codegen.loops[idx];
        clause = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                               op1, op2);
      } else {
        clause = constantI1(builder, loc, true);
      }
      cond = cond ? builder.create<arith::AndIOp>(loc, cond, clause) : clause;
    }
  }
  if (codegen.redVal)
    types.push_back(codegen.redVal.getType());
  if (codegen.expValues)
    types.push_back(builder.getIndexType());
  scf::IfOp ifOp = builder.create<scf::IfOp>(loc, types, cond, /*else=*/true);
  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
  return ifOp;
}
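// For reference, the if-statement built by genIf for co-iteration over two
// sparse tensors a and b at loop index i roughly takes the form below
// (simplified sketch; value names are illustrative and result types of the
// scf.if are omitted):
//
//   %ca   = arith.cmpi eq, %idx_a, %i : index
//   %cb   = arith.cmpi eq, %idx_b, %i : index
//   %cond = arith.andi %ca, %cb : i1
//   scf.if %cond {
//     ... body for this lattice point ...
//   } else {
//     ... filled in later by endIf/genWhileInduction ...
//   }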
/// Generates end of true branch of if-statement within a while-loop.
static void endIf(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                  linalg::GenericOp op, scf::IfOp ifOp, Operation *loop,
                  Value redInput, Value cntInput) {
  SmallVector<Value, 4> operands;
  if (codegen.redVal) {
    operands.push_back(codegen.redVal);
    updateReduc(merger, codegen, redInput);
  }
  if (codegen.expValues) {
    operands.push_back(codegen.expCount);
    codegen.expCount = cntInput;
  }
  if (!operands.empty())
    builder.create<scf::YieldOp>(op.getLoc(), operands);
  builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
}

//===----------------------------------------------------------------------===//
// Sparse compiler synthesis methods (loop sequence).
//===----------------------------------------------------------------------===//

/// Starts a loop sequence at given level. Returns true if
/// the universal loop index must be maintained at this level.
static bool startLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                         linalg::GenericOp op, std::vector<unsigned> &topSort,
                         unsigned exp, unsigned at, unsigned idx, unsigned ldx,
                         unsigned lts) {
  assert(codegen.curVecLength == 1);
  assert(!codegen.loops[idx]);
  // Emit invariants at this loop sequence level.
  genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/true);
  // Emit access pattern expansion for sparse tensor output.
  genExpansion(merger, codegen, builder, op, at, /*atStart=*/true);
  // Emit further initialization at this loop sequence level.
  unsigned l0 = merger.set(lts)[0];
  bool needsUniv =
      genInit(merger, codegen, builder, op, topSort, at, merger.lat(l0).bits);
  // Maintain the universal index only if it is actually
  // consumed by a subsequent lattice point.
  if (needsUniv) {
    unsigned lsize = merger.set(lts).size();
    for (unsigned i = 1; i < lsize; i++) {
      unsigned li = merger.set(lts)[i];
      if (!merger.hasAnyDimOf(merger.lat(li).simple, Dim::kSparse))
        return true;
    }
  }
  return false;
}

/// Starts a single loop in the current sequence.
static Operation *startLoop(Merger &merger, CodeGen &codegen,
                            OpBuilder &builder, linalg::GenericOp op,
                            std::vector<unsigned> &topSort, unsigned at,
                            unsigned li, bool needsUniv) {
  assert(codegen.curVecLength == 1);
  // Emit the for/while-loop control.
  Operation *loop = genLoop(merger, codegen, builder, op, topSort, at,
                            needsUniv, merger.lat(li).simple);
  // Emit the locals for this loop.
  genLocals(merger, codegen, builder, op, topSort, at, needsUniv,
            merger.lat(li).bits);
  return loop;
}

/// Ends a single loop in the current sequence. Returns the new value for
/// needsUniv.
static bool endLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                    linalg::GenericOp op, Operation *loop, unsigned idx,
                    unsigned li, bool needsUniv) {
  codegen.curVecLength = 1;
  // End a while-loop.
  if (auto whileOp = dyn_cast<scf::WhileOp>(loop)) {
    genWhileInduction(merger, codegen, builder, op, idx, needsUniv,
                      merger.lat(li).bits, whileOp);
    return needsUniv;
  }
  // End a for-loop.
  genForInduction(merger, codegen, builder, op, loop);
  return false;
}
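// Taken together, startLoopSeq/startLoop/endLoop/endLoopSeq emit a loop
// sequence of roughly the following shape per loop index (a sketch for
// exposition only, not actual IR):
//
//   <invariants, access pattern expansion>        // startLoopSeq
//   for each lattice point Li in the sequence:
//     while/for (...) {                           // startLoop
//       <locals>
//       <body, guarded by scf.if when co-iterating>
//       <induction>                               // endLoop
//     }
//   <reduction/expansion cleanup>                 // endLoopSeq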
/// Ends a loop sequence at given level.
static void endLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                       linalg::GenericOp op, unsigned exp, unsigned at,
                       unsigned idx, unsigned ldx) {
  assert(codegen.curVecLength == 1);
  codegen.loops[idx] = Value();
  // Bring a pending reduction back from SIMD form when sequence ends.
  if (codegen.redVal)
    if (auto vtp = codegen.redVal.getType().dyn_cast<VectorType>())
      updateReduc(merger, codegen,
                  genVectorReducEnd(codegen, builder, op.getLoc(), vtp));
  // Unmark bookkeeping of invariants and loop index.
  genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/false);
  // Finalize access pattern expansion for sparse tensor output.
  genExpansion(merger, codegen, builder, op, at, /*atStart=*/false);
}

/// Recursively generates code while computing iteration lattices in order
/// to manage the complexity of implementing co-iteration over unions
/// and intersections of sparse iteration spaces.
static void genStmt(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                    linalg::GenericOp op, std::vector<unsigned> &topSort,
                    unsigned exp, unsigned at) {
  // At each leaf, assign remaining tensor (sub)expression to output tensor.
  if (at == topSort.size()) {
    unsigned ldx = topSort[at - 1];
    Value rhs = genExp(merger, codegen, rewriter, op, exp, ldx);
    genTensorStore(merger, codegen, rewriter, op, exp, rhs);
    return;
  }

  // Construct iteration lattices for current loop index, with L0 at top.
  unsigned idx = topSort[at];
  unsigned ldx = at == 0 ? -1u : topSort[at - 1];
  unsigned lts = merger.optimizeSet(merger.buildLattices(exp, idx));

  // Start a loop sequence.
  bool needsUniv = startLoopSeq(merger, codegen, rewriter, op, topSort, exp, at,
                                idx, ldx, lts);

  // Emit a loop for every lattice point L0 >= Li in this loop sequence.
  unsigned lsize = merger.set(lts).size();
  for (unsigned i = 0; i < lsize; i++) {
    // Start a loop.
    unsigned li = merger.set(lts)[i];
    Operation *loop =
        startLoop(merger, codegen, rewriter, op, topSort, at, li, needsUniv);

    // Visit all lattice points with Li >= Lj to generate the
    // loop-body, possibly with if statements for co-iteration.
    Value redInput = codegen.redVal;
    Value cntInput = codegen.expCount;
    bool isWhile = dyn_cast<scf::WhileOp>(loop) != nullptr;
    for (unsigned j = 0; j < lsize; j++) {
      unsigned lj = merger.set(lts)[j];
      unsigned ej = merger.lat(lj).exp;
      if (li == lj || merger.latGT(li, lj)) {
        // Recurse into body of each branch.
        if (isWhile) {
          scf::IfOp ifOp =
              genIf(merger, codegen, rewriter, op, idx, merger.lat(lj).simple);
          genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1);
          endIf(merger, codegen, rewriter, op, ifOp, loop, redInput, cntInput);
        } else {
          genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1);
        }
      }
    }

    // End a loop.
    needsUniv =
        endLoop(merger, codegen, rewriter, op, loop, idx, li, needsUniv);
  }

  // End a loop sequence.
  endLoopSeq(merger, codegen, rewriter, op, exp, at, idx, ldx);
}
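// As a concrete (but simplified) illustration of the recursion above, an
// element-wise addition x(i) = a(i) + b(i) with both a and b sparse in i
// yields a three-point lattice and thus, in C-like pseudo-code rather than
// the actual scf/memref/arith IR, roughly:
//
//   while (pa < hi_a && pb < hi_b) {
//     ia = indices_a[pa]; ib = indices_b[pb]; i = min(ia, ib);
//     if (ia == i && ib == i)  x[i] = a[pa] + b[pb];
//     else if (ia == i)        x[i] = a[pa];
//     else                     x[i] = b[pb];
//     pa += (ia == i); pb += (ib == i);
//   }
//
// followed by clean-up loops over the remainders of a and b (omitted here).
// A purely dense index, in contrast, becomes a single for-loop.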
/// Converts the result computed by the sparse kernel into the required form.
static void genResult(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                      linalg::GenericOp op) {
  OpOperand *lhs = op.getOutputOperand(0);
  Type resType = lhs->get().getType();
  if (getSparseTensorEncoding(resType)) {
    // The sparse tensor rematerializes from the original sparse tensor's
    // underlying sparse storage format.
    rewriter.replaceOpWithNewOp<LoadOp>(op, resType, lhs->get(),
                                        codegen.sparseOut == lhs);
  } else {
    // To rematerialize a non-annotated tensor, simply load it
    // from the bufferized value.
    Value val = codegen.buffers.back(); // value array
    rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, resType, val);
  }
}

//===----------------------------------------------------------------------===//
// Sparse compiler rewriting methods.
//===----------------------------------------------------------------------===//

namespace {

/// Sparse rewriting rule for generic Linalg operation.
struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
public:
  GenericOpSparsifier(MLIRContext *context, SparsificationOptions o)
      : OpRewritePattern<linalg::GenericOp>(context), options(o) {}

  LogicalResult matchAndRewrite(linalg::GenericOp op,
                                PatternRewriter &rewriter) const override {
    // Detects sparse annotations and translates the per-dimension sparsity
    // information for all tensors to loop indices in the kernel.
    assert(op.getNumOutputs() == 1);
    unsigned numTensors = op.getNumInputsAndOutputs();
    unsigned numLoops = op.iterator_types().getValue().size();
    Merger merger(numTensors, numLoops);
    if (!findSparseAnnotations(merger, op))
      return failure();

    // Computes a topologically sorted iteration graph to ensure
    // tensors are visited in natural index order. Fails on cycles.
    // This assumes that higher-level passes have already put the
    // tensors in each tensor expression in a feasible order.
    std::vector<unsigned> topSort;
    if (!computeIterationGraph(merger, op, topSort,
                               SortMask::kIncludeUndef |
                                   SortMask::kIncludeDense) &&
        !computeIterationGraph(merger, op, topSort, SortMask::kIncludeUndef) &&
        !computeIterationGraph(merger, op, topSort, SortMask::kIncludeDense) &&
        !computeIterationGraph(merger, op, topSort, SortMask::kSparseOnly))
      return failure();

    // Builds the tensor expression for the Linalg operation in SSA form.
    Optional<unsigned> optExp = merger.buildTensorExpFromLinalg(op);
    if (!optExp.hasValue())
      return failure();
    unsigned exp = optExp.getValue();

    // Rejects an inadmissable tensor expression.
    OpOperand *sparseOut = nullptr;
    unsigned outerParNest = 0;
    if (!isAdmissableTensorExp(merger, op, topSort, exp, &sparseOut,
                               outerParNest))
      return failure();

    // Recursively generates code.
    merger.setHasSparseOut(sparseOut != nullptr);
    CodeGen codegen(options, numTensors, numLoops, sparseOut, outerParNest);
    genBuffers(merger, codegen, rewriter, op);
    genStmt(merger, codegen, rewriter, op, topSort, exp, 0);
    genResult(merger, codegen, rewriter, op);
    return success();
  }
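  // For reference, a typical op handled by this pattern looks roughly like
  // the following (heavily simplified sketch; #CSR and #trait stand for a
  // sparse tensor encoding and an indexing-map trait defined elsewhere):
  //
  //   %0 = linalg.generic #trait
  //          ins(%a, %b : tensor<?x?xf64, #CSR>, tensor<?xf64>)
  //          outs(%x : tensor<?xf64>) {
  //          ^bb0(%va: f64, %vb: f64, %vx: f64):
  //            ...
  //        } -> tensor<?xf64>
  //
  // which is rewritten into explicit scf loops over the sparse storage of %a.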
private:
  /// Options to control sparse code generation.
  SparsificationOptions options;
};

} // namespace

/// Populates the given patterns list with rewriting rules required for
/// the sparsification of linear algebra operations.
void mlir::populateSparsificationPatterns(
    RewritePatternSet &patterns, const SparsificationOptions &options) {
  patterns.add<GenericOpSparsifier>(patterns.getContext(), options);
}
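// Typical driver usage (a sketch only; the enclosing pass and the greedy
// driver call shown here are illustrative of common MLIR practice, not
// mandated by this file):
//
//   RewritePatternSet patterns(ctx);
//   populateSparsificationPatterns(patterns, options);
//   (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));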