//===- LowerMatrixIntrinsics.cpp - Lower matrix intrinsics -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Lower matrix intrinsics to vector operations.
//
// TODO:
//  * Improve fusion:
//   * Support more cases, e.g. multiply-add, multiply-sub, operands/results
//     transposed.
//  * Improve cost-modeling, e.g. choose different number of rows/columns
//    for tiles, consider cost of copies on alias.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/MatrixUtils.h"

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "lower-matrix-intrinsics"

// If enabled, shape information from matrix intrinsics is propagated to
// connected instructions so they can be lowered with explicit shapes too.
static cl::opt<bool> EnableShapePropagation(
    "matrix-propagate-shape", cl::init(true), cl::Hidden,
    cl::desc("Enable/disable shape propagation from matrix intrinsics to other "
             "instructions."));

static cl::opt<bool>
    FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,
               cl::desc("Enable/disable fusing matrix instructions."));
// TODO: Allow and use non-square tiles.
static cl::opt<unsigned> TileSize(
    "fuse-matrix-tile-size", cl::init(4), cl::Hidden,
    cl::desc(
        "Tile size for matrix instruction fusion using square-shaped tiles."));
static cl::opt<bool> TileUseLoops("fuse-matrix-use-loops", cl::init(false),
                                  cl::Hidden,
                                  cl::desc("Generate loop nest for tiling."));
static cl::opt<bool> ForceFusion(
    "force-fuse-matrix", cl::init(false), cl::Hidden,
    cl::desc("Force matrix instruction fusion even if not profitable."));
static cl::opt<bool> AllowContractEnabled(
    "matrix-allow-contract", cl::init(false), cl::Hidden,
    cl::desc("Allow the use of FMAs if available and profitable. This may "
             "result in different results, due to less rounding error."));

// Layout used for the in-memory/flat-vector representation of matrixes.
enum class MatrixLayoutTy { ColumnMajor, RowMajor };

static cl::opt<MatrixLayoutTy> MatrixLayout(
    "matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor),
    cl::desc("Sets the default matrix layout"),
    cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major",
                          "Use column-major layout"),
               clEnumValN(MatrixLayoutTy::RowMajor, "row-major",
                          "Use row-major layout")));

/// Helper function to either return Scope, if it is a subprogram, or the
/// attached subprogram for a local scope.
static DISubprogram *getSubprogram(DIScope *Scope) {
  if (auto *Subprogram = dyn_cast<DISubprogram>(Scope))
    return Subprogram;
  return cast<DILocalScope>(Scope)->getSubprogram();
}

namespace {

// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
// the start address of vector \p VecIdx with type (\p EltType x \p NumElements)
// assuming \p Stride elements between the starts of two consecutive vectors.
// \p Stride must be >= \p NumElements.
// For column-major matrixes, the function computes the address of a column
// vector and \p NumElements must be set to the number of elements in a column
// (= number of rows of the matrix). For row-major matrixes, the function
// computes the address of a row vector and \p NumElements must be set to the
// number of elements in a row (= number of columns of the matrix).
//
// Consider a 4x4 matrix in column-major layout like below
//
//      0       1      2      3
// 0   v_0_0  v_0_1  v_0_2  v_0_3
// 1   v_1_0  v_1_1  v_1_2  v_1_3
// 2   v_2_0  v_2_1  v_2_2  v_2_3
// 3   v_3_0  v_3_1  v_3_2  v_3_3

// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1,
// we need a pointer to the first element of the submatrix as base pointer.
// Then we can use computeVectorAddr to compute the addresses for the columns
// of the sub-matrix.
//
// Column 0: computeVectorAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)
//           -> just returns Base
// Column 1: computeVectorAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)
//           -> returns Base + (1 * 4)
// Column 2: computeVectorAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)
//           -> returns Base + (2 * 4)
//
// The graphic below illustrates the number of elements in a column (marked
// with |) and the number of skipped elements (marked with }).
//
//         v_0_0  v_0_1 {v_0_2 {v_0_3
//                Base   Col 1  Col 2
//                  |     |      |
//         v_1_0 |v_1_1 |v_1_2 |v_1_3
//         v_2_0 |v_2_1 |v_2_2 |v_2_3
//         v_3_0 {v_3_1 {v_3_2  v_3_3
//
Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
                         unsigned NumElements, Type *EltType,
                         IRBuilder<> &Builder) {

  assert((!isa<ConstantInt>(Stride) ||
          cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
         "Stride must be >= the number of elements in the result vector.");
  unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();

  // Compute the start of the vector with index VecIdx as VecIdx * Stride.
  Value *VecStart = Builder.CreateMul(VecIdx, Stride, "vec.start");

  // Get pointer to the start of the selected vector. Skip GEP creation,
  // if we select vector 0.
  if (isa<ConstantInt>(VecStart) && cast<ConstantInt>(VecStart)->isZero())
    VecStart = BasePtr;
  else
    VecStart = Builder.CreateGEP(EltType, BasePtr, VecStart, "vec.gep");

  // Cast elementwise vector start pointer to a pointer to a vector
  // (EltType x NumElements)*.
  auto *VecType = FixedVectorType::get(EltType, NumElements);
  Type *VecPtrType = PointerType::get(VecType, AS);
  return Builder.CreatePointerCast(VecStart, VecPtrType, "vec.cast");
}

/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
///
/// Currently, the lowering for each matrix intrinsic is done as follows:
/// 1. Propagate the shape information from intrinsics to connected
/// instructions.
/// 2. Lower instructions with shape information (assuming column-major layout).
///  The lowering works similarly using row-major layout.
///  2.1. Get column vectors for each argument. If we already lowered the
///       definition of an argument, use the produced column vectors directly.
///       If not, split the operand vector containing an embedded matrix into
///       a set of column vectors,
///  2.2. Lower the instruction in terms of column major operations, which
///       yields a set of column vectors containing result matrix. Note that we
///       lower all instructions that have shape information. Besides the
///       intrinsics, this includes stores for example.
///  2.3. Update uses of the lowered instruction. If we have shape information
///       for a user, there is nothing to do, as we will look up the result
///       column matrix when lowering the user. For other uses, we embed the
///       result matrix in a flat vector and update the use.
///  2.4. Cache the result column matrix for the instruction we lowered
/// 3. After we lowered all instructions in a function, remove the now
///    obsolete instructions.
///
class LowerMatrixIntrinsics {
  Function &Func;
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  OptimizationRemarkEmitter *ORE;

  /// Contains estimates of the number of operations (loads, stores, compute)
  /// required to lower a matrix operation.
  struct OpInfoTy {
    /// Number of stores emitted to generate this matrix.
    unsigned NumStores = 0;
    /// Number of loads emitted to generate this matrix.
    unsigned NumLoads = 0;
    /// Number of compute operations emitted to generate this matrix.
    unsigned NumComputeOps = 0;

    /// Accumulate the counts from \p RHS into this instance.
    OpInfoTy &operator+=(const OpInfoTy &RHS) {
      NumStores += RHS.NumStores;
      NumLoads += RHS.NumLoads;
      NumComputeOps += RHS.NumComputeOps;
      return *this;
    }
  };

  /// Wrapper class representing a matrix as a set of vectors, either in row or
  /// column major layout. All vectors must have the same vector type.
214 class MatrixTy { 215 SmallVector<Value *, 16> Vectors; 216 217 OpInfoTy OpInfo; 218 219 bool IsColumnMajor = true; 220 221 public: 222 MatrixTy() 223 : Vectors(), 224 IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} 225 MatrixTy(ArrayRef<Value *> Vectors) 226 : Vectors(Vectors.begin(), Vectors.end()), 227 IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} 228 MatrixTy(unsigned NumRows, unsigned NumColumns, Type *EltTy) 229 : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) { 230 231 unsigned D = isColumnMajor() ? NumColumns : NumRows; 232 for (unsigned J = 0; J < D; ++J) 233 addVector(UndefValue::get(FixedVectorType::get( 234 EltTy, isColumnMajor() ? NumRows : NumColumns))); 235 } 236 237 Value *getVector(unsigned i) const { return Vectors[i]; } 238 Value *getColumn(unsigned i) const { 239 assert(isColumnMajor() && "only supported for column-major matrixes"); 240 return Vectors[i]; 241 } 242 Value *getRow(unsigned i) const { 243 assert(!isColumnMajor() && "only supported for row-major matrixes"); 244 return Vectors[i]; 245 } 246 247 void setVector(unsigned i, Value *V) { Vectors[i] = V; } 248 249 Type *getElementType() const { return getVectorTy()->getElementType(); } 250 251 unsigned getNumVectors() const { 252 if (isColumnMajor()) 253 return getNumColumns(); 254 return getNumRows(); 255 } 256 257 unsigned getNumColumns() const { 258 if (isColumnMajor()) 259 return Vectors.size(); 260 else { 261 assert(Vectors.size() > 0 && "Cannot call getNumRows without columns"); 262 return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements(); 263 } 264 } 265 unsigned getNumRows() const { 266 if (isColumnMajor()) { 267 assert(Vectors.size() > 0 && "Cannot call getNumRows without columns"); 268 return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements(); 269 } else 270 return Vectors.size(); 271 } 272 273 void addVector(Value *V) { Vectors.push_back(V); } 274 VectorType *getColumnTy() { 275 assert(isColumnMajor() && "only 
supported for column-major matrixes"); 276 return getVectorTy(); 277 } 278 279 VectorType *getVectorTy() const { 280 return cast<VectorType>(Vectors[0]->getType()); 281 } 282 283 iterator_range<SmallVector<Value *, 8>::iterator> columns() { 284 assert(isColumnMajor() && 285 "columns() only supported for column-major matrixes"); 286 return make_range(Vectors.begin(), Vectors.end()); 287 } 288 289 iterator_range<SmallVector<Value *, 8>::iterator> vectors() { 290 return make_range(Vectors.begin(), Vectors.end()); 291 } 292 293 /// Embed the vectors of the matrix into a flat vector by concatenating 294 /// them. 295 Value *embedInVector(IRBuilder<> &Builder) const { 296 return Vectors.size() == 1 ? Vectors[0] 297 : concatenateVectors(Builder, Vectors); 298 } 299 300 MatrixTy &addNumLoads(unsigned N) { 301 OpInfo.NumLoads += N; 302 return *this; 303 } 304 305 void setNumLoads(unsigned N) { OpInfo.NumLoads = N; } 306 307 MatrixTy &addNumStores(unsigned N) { 308 OpInfo.NumStores += N; 309 return *this; 310 } 311 312 MatrixTy &addNumComputeOps(unsigned N) { 313 OpInfo.NumComputeOps += N; 314 return *this; 315 } 316 317 unsigned getNumStores() const { return OpInfo.NumStores; } 318 unsigned getNumLoads() const { return OpInfo.NumLoads; } 319 unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; } 320 321 const OpInfoTy &getOpInfo() const { return OpInfo; } 322 323 bool isColumnMajor() const { return IsColumnMajor; } 324 325 unsigned getStride() const { 326 if (isColumnMajor()) 327 return getNumRows(); 328 return getNumColumns(); 329 } 330 331 /// Extract a vector of \p NumElts starting at index (\p I, \p J). If the 332 /// matrix is column-major, the result vector is extracted from a column 333 /// vector, otherwise from a row vector. 334 Value *extractVector(unsigned I, unsigned J, unsigned NumElts, 335 IRBuilder<> &Builder) const { 336 Value *Vec = isColumnMajor() ? 
getColumn(J) : getRow(I); 337 return Builder.CreateShuffleVector( 338 Vec, createSequentialMask(isColumnMajor() ? I : J, NumElts, 0), 339 "block"); 340 } 341 }; 342 343 struct ShapeInfo { 344 unsigned NumRows; 345 unsigned NumColumns; 346 347 bool IsColumnMajor; 348 349 ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0) 350 : NumRows(NumRows), NumColumns(NumColumns), 351 IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} 352 353 ShapeInfo(Value *NumRows, Value *NumColumns) 354 : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(), 355 cast<ConstantInt>(NumColumns)->getZExtValue()) {} 356 357 bool operator==(const ShapeInfo &other) { 358 return NumRows == other.NumRows && NumColumns == other.NumColumns; 359 } 360 bool operator!=(const ShapeInfo &other) { return !(*this == other); } 361 362 /// Returns true if shape-information is defined, meaning both dimensions 363 /// are != 0. 364 operator bool() const { 365 assert(NumRows == 0 || NumColumns != 0); 366 return NumRows != 0; 367 } 368 369 unsigned getStride() const { 370 if (IsColumnMajor) 371 return NumRows; 372 return NumColumns; 373 } 374 375 unsigned getNumVectors() const { 376 if (IsColumnMajor) 377 return NumColumns; 378 return NumRows; 379 } 380 }; 381 382 /// Maps instructions to their shape information. The shape information 383 /// describes the shape to be used while lowering. This matches the shape of 384 /// the result value of the instruction, with the only exceptions being store 385 /// instructions and the matrix_column_major_store intrinsics. For those, the 386 /// shape information indicates that those instructions should be lowered 387 /// using shape information as well. 388 DenseMap<Value *, ShapeInfo> ShapeMap; 389 390 /// List of instructions to remove. While lowering, we are not replacing all 391 /// users of a lowered instruction, if shape information is available and 392 /// those need to be removed after we finished lowering. 
  SmallVector<Instruction *, 16> ToRemove;

  /// Map from instructions to their produced column matrix.
  MapVector<Value *, MatrixTy> Inst2ColumnMatrix;

public:
  LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
                        AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI,
                        OptimizationRemarkEmitter *ORE)
      : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), AA(AA), DT(DT),
        LI(LI), ORE(ORE) {}

  /// Return the estimated number of vector ops required for an operation on
  /// the full vector type \p VT.
  unsigned getNumOps(Type *VT) {
    assert(isa<VectorType>(VT) && "Expected vector type");
    return getNumOps(VT->getScalarType(),
                     cast<FixedVectorType>(VT)->getNumElements());
  }

  /// Return the estimated number of vector ops required for an operation on
  /// \p VT * N.
  unsigned getNumOps(Type *ST, unsigned N) {
    return std::ceil((ST->getPrimitiveSizeInBits() * N).getFixedSize() /
                     double(TTI.getRegisterBitWidth(
                                   TargetTransformInfo::RGK_FixedWidthVector)
                                .getFixedSize()));
  }

  /// Return the set of vectors that a matrix value is lowered to.
  ///
  /// If we lowered \p MatrixVal, just return the cache result matrix. Otherwise
  /// split the flat vector \p MatrixVal containing a matrix with shape \p SI
  /// into vectors.
  MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
                     IRBuilder<> &Builder) {
    VectorType *VType = dyn_cast<VectorType>(MatrixVal->getType());
    assert(VType && "MatrixVal must be a vector type");
    assert(cast<FixedVectorType>(VType)->getNumElements() ==
               SI.NumRows * SI.NumColumns &&
           "The vector size must match the number of matrix elements");

    // Check if we lowered MatrixVal using shape information. In that case,
    // return the existing matrix, if it matches the requested shape
    // information. If there is a mis-match, embed the result in a flat
    // vector and split it later.
    auto Found = Inst2ColumnMatrix.find(MatrixVal);
    if (Found != Inst2ColumnMatrix.end()) {
      MatrixTy &M = Found->second;
      // Return the found matrix, if its shape matches the requested shape
      // information
      if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns())
        return M;

      MatrixVal = M.embedInVector(Builder);
    }

    // Otherwise split MatrixVal.
    SmallVector<Value *, 16> SplitVecs;
    for (unsigned MaskStart = 0;
         MaskStart < cast<FixedVectorType>(VType)->getNumElements();
         MaskStart += SI.getStride()) {
      Value *V = Builder.CreateShuffleVector(
          MatrixVal, createSequentialMask(MaskStart, SI.getStride(), 0),
          "split");
      SplitVecs.push_back(V);
    }

    return {SplitVecs};
  }

  /// If \p V already has a known shape return false. Otherwise set the shape
  /// for instructions that support it.
  bool setShapeInfo(Value *V, ShapeInfo Shape) {
    assert(Shape && "Shape not set");
    if (isa<UndefValue>(V) || !supportsShapeInfo(V))
      return false;

    auto SIter = ShapeMap.find(V);
    if (SIter != ShapeMap.end()) {
      LLVM_DEBUG(dbgs() << "  not overriding existing shape: "
                        << SIter->second.NumRows << " "
                        << SIter->second.NumColumns << " for " << *V << "\n");
      return false;
    }

    ShapeMap.insert({V, Shape});
    LLVM_DEBUG(dbgs() << "  " << Shape.NumRows << " x " << Shape.NumColumns
                      << " for " << *V << "\n");
    return true;
  }

  /// Returns true for instructions whose result has the same shape as all of
  /// their matrix operands (element-wise ops and scalar multiplies).
  bool isUniformShape(Value *V) {
    Instruction *I = dyn_cast<Instruction>(V);
    if (!I)
      return true;

    switch (I->getOpcode()) {
    case Instruction::FAdd:
    case Instruction::FSub:
    case Instruction::FMul: // Scalar multiply.
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::Mul:
    case Instruction::Sub:
      return true;
    default:
      return false;
    }
  }

  /// Returns true if shape information can be used for \p V. The supported
  /// instructions must match the instructions that can be lowered by this pass.
  bool supportsShapeInfo(Value *V) {
    Instruction *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      return false;

    IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
    if (II)
      switch (II->getIntrinsicID()) {
      case Intrinsic::matrix_multiply:
      case Intrinsic::matrix_transpose:
      case Intrinsic::matrix_column_major_load:
      case Intrinsic::matrix_column_major_store:
        return true;
      default:
        return false;
      }
    return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
  }

  /// Propagate the shape information of instructions to their users.
  /// The work list contains instructions for which we can compute the shape,
  /// either based on the information provided by matrix intrinsics or known
  /// shapes of operands.
  SmallVector<Instruction *, 32>
  propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) {
    SmallVector<Instruction *, 32> NewWorkList;
    // Pop an element for which we guaranteed to have at least one of the
    // operand shapes. Add the shape for this and then add users to the work
    // list.
    LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n");
    while (!WorkList.empty()) {
      Instruction *Inst = WorkList.pop_back_val();

      // New entry, set the value and insert operands
      bool Propagate = false;

      Value *MatrixA;
      Value *MatrixB;
      Value *M;
      Value *N;
      Value *K;
      if (match(Inst, m_Intrinsic<Intrinsic::matrix_multiply>(
                          m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
                          m_Value(N), m_Value(K)))) {
        // (M x N) * (N x K) yields an M x K result.
        Propagate = setShapeInfo(Inst, {M, K});
      } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_transpose>(
                                 m_Value(MatrixA), m_Value(M), m_Value(N)))) {
        // Flip dimensions.
        Propagate = setShapeInfo(Inst, {N, M});
      } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_store>(
                                 m_Value(MatrixA), m_Value(), m_Value(),
                                 m_Value(), m_Value(M), m_Value(N)))) {
        Propagate = setShapeInfo(Inst, {N, M});
      } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_load>(
                                 m_Value(), m_Value(), m_Value(), m_Value(M),
                                 m_Value(N)))) {
        Propagate = setShapeInfo(Inst, {M, N});
      } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) {
        // A plain store has the shape of its stored operand, if known.
        auto OpShape = ShapeMap.find(MatrixA);
        if (OpShape != ShapeMap.end())
          setShapeInfo(Inst, OpShape->second);
        continue;
      } else if (isUniformShape(Inst)) {
        // Find the first operand that has a known shape and use that.
        for (auto &Op : Inst->operands()) {
          auto OpShape = ShapeMap.find(Op.get());
          if (OpShape != ShapeMap.end()) {
            Propagate |= setShapeInfo(Inst, OpShape->second);
            break;
          }
        }
      }

      if (Propagate) {
        NewWorkList.push_back(Inst);
        for (auto *User : Inst->users())
          if (ShapeMap.count(User) == 0)
            WorkList.push_back(cast<Instruction>(User));
      }
    }

    return NewWorkList;
  }

  /// Propagate the shape to operands of instructions with shape information.
  /// \p Worklist contains the instruction for which we already know the shape.
  SmallVector<Instruction *, 32>
  propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) {
    SmallVector<Instruction *, 32> NewWorkList;

    auto pushInstruction = [](Value *V,
                              SmallVectorImpl<Instruction *> &WorkList) {
      Instruction *I = dyn_cast<Instruction>(V);
      if (I)
        WorkList.push_back(I);
    };
    // Pop an element with known shape.  Traverse the operands, if their shape
    // derives from the result shape and is unknown, add it and add them to the
    // worklist.
    LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n");
    while (!WorkList.empty()) {
      Value *V = WorkList.pop_back_val();

      size_t BeforeProcessingV = WorkList.size();
      if (!isa<Instruction>(V))
        continue;

      Value *MatrixA;
      Value *MatrixB;
      Value *M;
      Value *N;
      Value *K;
      if (match(V, m_Intrinsic<Intrinsic::matrix_multiply>(
                       m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
                       m_Value(N), m_Value(K)))) {
        if (setShapeInfo(MatrixA, {M, N}))
          pushInstruction(MatrixA, WorkList);

        if (setShapeInfo(MatrixB, {N, K}))
          pushInstruction(MatrixB, WorkList);

      } else if (match(V, m_Intrinsic<Intrinsic::matrix_transpose>(
                              m_Value(MatrixA), m_Value(M), m_Value(N)))) {
        // Flip dimensions.
        if (setShapeInfo(MatrixA, {M, N}))
          pushInstruction(MatrixA, WorkList);
      } else if (match(V, m_Intrinsic<Intrinsic::matrix_column_major_store>(
                              m_Value(MatrixA), m_Value(), m_Value(), m_Value(),
                              m_Value(M), m_Value(N)))) {
        if (setShapeInfo(MatrixA, {M, N})) {
          pushInstruction(MatrixA, WorkList);
        }
      } else if (isa<LoadInst>(V) ||
                 match(V, m_Intrinsic<Intrinsic::matrix_column_major_load>())) {
        // Nothing to do, no matrix input.
      } else if (isa<StoreInst>(V)) {
        // Nothing to do.  We forward-propagated to this so we would just
        // backward propagate to an instruction with an already known shape.
      } else if (isUniformShape(V)) {
        // Propagate to all operands.
        ShapeInfo Shape = ShapeMap[V];
        for (Use &U : cast<Instruction>(V)->operands()) {
          if (setShapeInfo(U.get(), Shape))
            pushInstruction(U.get(), WorkList);
        }
      }
      // After we discovered new shape info for new instructions in the
      // worklist, we use their users as seeds for the next round of forward
      // propagation.
      for (size_t I = BeforeProcessingV; I != WorkList.size(); I++)
        for (User *U : WorkList[I]->users())
          if (isa<Instruction>(U) && V != U)
            NewWorkList.push_back(cast<Instruction>(U));
    }
    return NewWorkList;
  }

  /// Main entry point: propagate shapes, fuse multiply candidates, lower the
  /// remaining shape-carrying instructions and erase obsolete ones.
  /// Returns true if the function was changed.
  bool Visit() {
    if (EnableShapePropagation) {
      SmallVector<Instruction *, 32> WorkList;

      // Initially only the shape of matrix intrinsics is known.
      // Initialize the work list with ops carrying shape information.
      for (BasicBlock &BB : Func)
        for (Instruction &Inst : BB) {
          IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst);
          if (!II)
            continue;

          switch (II->getIntrinsicID()) {
          case Intrinsic::matrix_multiply:
          case Intrinsic::matrix_transpose:
          case Intrinsic::matrix_column_major_load:
          case Intrinsic::matrix_column_major_store:
            WorkList.push_back(&Inst);
            break;
          default:
            break;
          }
        }
      // Propagate shapes until nothing changes any longer.
      while (!WorkList.empty()) {
        WorkList = propagateShapeForward(WorkList);
        WorkList = propagateShapeBackward(WorkList);
      }
    }

    bool Changed = false;
    SmallVector<CallInst *, 16> MaybeFusableInsts;
    SmallVector<Instruction *, 16> MatrixInsts;

    // First, collect all instructions with shape information and candidates for
    // fusion (currently only matrix multiplies).
    ReversePostOrderTraversal<Function *> RPOT(&Func);
    for (auto *BB : RPOT)
      for (Instruction &I : *BB) {
        if (ShapeMap.find(&I) == ShapeMap.end())
          continue;
        if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))
          MaybeFusableInsts.push_back(cast<CallInst>(&I));
        MatrixInsts.push_back(&I);
      }

    // Second, try to fuse candidates.
    SmallPtrSet<Instruction *, 16> FusedInsts;
    for (CallInst *CI : MaybeFusableInsts)
      LowerMatrixMultiplyFused(CI, FusedInsts);
    Changed = !FusedInsts.empty();

    // Third, lower remaining instructions with shape information.
    for (Instruction *Inst : MatrixInsts) {
      if (FusedInsts.count(Inst))
        continue;

      IRBuilder<> Builder(Inst);

      if (CallInst *CInst = dyn_cast<CallInst>(Inst))
        Changed |= VisitCallInst(CInst);

      Value *Op1;
      Value *Op2;
      if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
        Changed |= VisitBinaryOperator(BinOp);
      if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
        Changed |= VisitUnaryOperator(UnOp);
      if (match(Inst, m_Load(m_Value(Op1))))
        Changed |= VisitLoad(cast<LoadInst>(Inst), Op1, Builder);
      else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
        Changed |= VisitStore(cast<StoreInst>(Inst), Op1, Op2, Builder);
    }

    if (ORE) {
      RemarkGenerator RemarkGen(Inst2ColumnMatrix, *ORE, Func);
      RemarkGen.emitRemarks();
    }

    // Delete the instructions backwards, as it has a reduced likelihood of
    // having to update as many def-use and use-def chains.
    for (auto *Inst : reverse(ToRemove)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }

    return Changed;
  }

  /// Turns \p BasePtr into an elementwise pointer to \p EltType.
  Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) {
    unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
    Type *EltPtrType = PointerType::get(EltType, AS);
    return Builder.CreatePointerCast(BasePtr, EltPtrType);
  }

  /// Replace intrinsic calls. Returns true if \p Inst was a matrix intrinsic
  /// that has been lowered.
  bool VisitCallInst(CallInst *Inst) {
    if (!Inst->getCalledFunction() || !Inst->getCalledFunction()->isIntrinsic())
      return false;

    switch (Inst->getCalledFunction()->getIntrinsicID()) {
    case Intrinsic::matrix_multiply:
      LowerMultiply(Inst);
      break;
    case Intrinsic::matrix_transpose:
      LowerTranspose(Inst);
      break;
    case Intrinsic::matrix_column_major_load:
      LowerColumnMajorLoad(Inst);
      break;
    case Intrinsic::matrix_column_major_store:
      LowerColumnMajorStore(Inst);
      break;
    default:
      return false;
    }
    return true;
  }

  /// Compute the alignment for a column/row \p Idx with \p Stride between them.
  /// The address at \p Idx == 0 has alignment \p A. If \p Stride is a
  /// ConstantInt, reduce the initial alignment based on the byte offset. For
  /// non-ConstantInt strides, return the common alignment of the initial
  /// alignment and the element size in bytes.
  Align getAlignForIndex(unsigned Idx, Value *Stride, Type *ElementTy,
                         MaybeAlign A) const {
    Align InitialAlign = DL.getValueOrABITypeAlignment(A, ElementTy);
    if (Idx == 0)
      return InitialAlign;

    TypeSize ElementSizeInBits = DL.getTypeSizeInBits(ElementTy);
    if (auto *ConstStride = dyn_cast<ConstantInt>(Stride)) {
      uint64_t StrideInBytes =
          ConstStride->getZExtValue() * ElementSizeInBits / 8;
      return commonAlignment(InitialAlign, Idx * StrideInBytes);
    }
    return commonAlignment(InitialAlign, ElementSizeInBits / 8);
  }

  /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between
  /// vectors.
  MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
                      bool IsVolatile, ShapeInfo Shape, IRBuilder<> &Builder) {
    auto *VType = cast<VectorType>(Ty);
    Type *EltTy = VType->getElementType();
    Type *VecTy = FixedVectorType::get(EltTy, Shape.getStride());
    Value *EltPtr = createElementPtr(Ptr, EltTy, Builder);
    MatrixTy Result;
    // Emit one strided vector load per row/column of the matrix.
    for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
      Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride,
                                     Shape.getStride(), EltTy, Builder);
      Value *Vector = Builder.CreateAlignedLoad(
          VecTy, GEP, getAlignForIndex(I, Stride, EltTy, MAlign),
          IsVolatile, "col.load");

      Result.addVector(Vector);
    }
    return Result.addNumLoads(getNumOps(Result.getVectorTy()) *
                              Result.getNumVectors());
  }

  /// Loads a sub-matrix with shape \p ResultShape from a \p R x \p C matrix,
  /// starting at \p MatrixPtr[I][J].
  MatrixTy loadMatrix(Value *MatrixPtr, MaybeAlign Align, bool IsVolatile,
                      ShapeInfo MatrixShape, Value *I, Value *J,
                      ShapeInfo ResultShape, Type *EltTy,
                      IRBuilder<> &Builder) {

    // Element offset of the tile start: J * stride + I.
    Value *Offset = Builder.CreateAdd(
        Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);

    unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
    Value *EltPtr =
        Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
    Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
    auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows *
                                                   ResultShape.NumColumns);
    Type *TilePtrTy = PointerType::get(TileTy, AS);
    Value *TilePtr =
        Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");

    // Delegate to the strided loader; the stride of the enclosing matrix
    // stays the distance between the tile's columns.
    return loadMatrix(TileTy, TilePtr, Align,
                      Builder.getInt64(MatrixShape.getStride()), IsVolatile,
                      ResultShape, Builder);
  }

  /// Lower a load instruction with shape information.
  void LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align, Value *Stride,
                 bool IsVolatile, ShapeInfo Shape) {
    IRBuilder<> Builder(Inst);
    finalizeLowering(Inst,
                     loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile,
                                Shape, Builder),
                     Builder);
  }

  /// Lowers llvm.matrix.column.major.load.
  ///
  /// The intrinsic loads a matrix from memory using a stride between columns.
  void LowerColumnMajorLoad(CallInst *Inst) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Intrinsic only supports column-major layout!");
    Value *Ptr = Inst->getArgOperand(0);
    Value *Stride = Inst->getArgOperand(1);
    // Operand 2 is the i1 volatile flag, operands 3/4 are the dimensions.
    LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride,
              cast<ConstantInt>(Inst->getArgOperand(2))->isOne(),
              {Inst->getArgOperand(3), Inst->getArgOperand(4)});
  }

  /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p
  /// MatrixPtr[I][J].
  void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr,
                   MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,
                   Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {
    // Element offset of the tile start: J * stride + I.
    Value *Offset = Builder.CreateAdd(
        Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);

    unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
    Value *EltPtr =
        Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
    Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
    auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *
                                                   StoreVal.getNumColumns());
    Type *TilePtrTy = PointerType::get(TileTy, AS);
    Value *TilePtr =
        Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");

    storeMatrix(TileTy, StoreVal, TilePtr, MAlign,
                Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder);
  }

  /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between
  /// vectors.
  MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr,
                       MaybeAlign MAlign, Value *Stride, bool IsVolatile,
                       IRBuilder<> &Builder) {
    auto VType = cast<VectorType>(Ty);
    Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
    for (auto Vec : enumerate(StoreVal.vectors())) {
      Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()),
                                     Stride, StoreVal.getStride(),
                                     VType->getElementType(), Builder);
      Builder.CreateAlignedStore(Vec.value(), GEP,
                                 getAlignForIndex(Vec.index(), Stride,
                                                  VType->getElementType(),
                                                  MAlign),
                                 IsVolatile);
    }
    // The returned MatrixTy carries no vectors; it only transports the
    // store-op count for the optimization remarks.
    return MatrixTy().addNumStores(getNumOps(StoreVal.getVectorTy()) *
                                   StoreVal.getNumVectors());
  }

  /// Lower a store instruction with shape information.
  void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, MaybeAlign A,
                  Value *Stride, bool IsVolatile, ShapeInfo Shape) {
    IRBuilder<> Builder(Inst);
    auto StoreVal = getMatrix(Matrix, Shape, Builder);
    finalizeLowering(Inst,
                     storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride,
                                 IsVolatile, Builder),
                     Builder);
  }

  /// Lowers llvm.matrix.column.major.store.
  ///
  /// The intrinsic stores a matrix back to memory using a stride between
  /// columns.
  void LowerColumnMajorStore(CallInst *Inst) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Intrinsic only supports column-major layout!");
    Value *Matrix = Inst->getArgOperand(0);
    Value *Ptr = Inst->getArgOperand(1);
    Value *Stride = Inst->getArgOperand(2);
    // Operand 3 is the i1 volatile flag; operands 4 and 5 carry the shape
    // (rows, columns).
    LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride,
               cast<ConstantInt>(Inst->getArgOperand(3))->isOne(),
               {Inst->getArgOperand(4), Inst->getArgOperand(5)});
  }

  // Set elements I..I+NumElts-1 to Block
  Value *insertVector(Value *Col, unsigned I, Value *Block,
                      IRBuilder<> &Builder) {

    // First, bring Block to the same size as Col, padding with undef elements
    // via a widening shuffle.
    unsigned BlockNumElts =
        cast<FixedVectorType>(Block->getType())->getNumElements();
    unsigned NumElts = cast<FixedVectorType>(Col->getType())->getNumElements();
    assert(NumElts >= BlockNumElts && "Too few elements for current block");

    Block = Builder.CreateShuffleVector(
        Block, createSequentialMask(0, BlockNumElts, NumElts - BlockNumElts));

    // If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7,
    // 8, 4, 5, 6 (indices >= 7 select from the widened Block operand).
    SmallVector<int, 16> Mask;
    unsigned i;
    for (i = 0; i < I; i++)
      Mask.push_back(i);

    unsigned VecNumElts =
        cast<FixedVectorType>(Col->getType())->getNumElements();
    for (; i < I + BlockNumElts; i++)
      Mask.push_back(i - I + VecNumElts);

    for (; i < VecNumElts; i++)
      Mask.push_back(i);

    return Builder.CreateShuffleVector(Col, Block, Mask);
  }

  /// Emit \p A * \p B (plus \p Sum if non-null), preferring fmuladd when
  /// contraction is allowed. \p NumComputeOps is incremented by the number of
  /// vector ops generated (once for the multiply, once more for a separate
  /// add).
  Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
                      IRBuilder<> &Builder, bool AllowContraction,
                      unsigned &NumComputeOps) {
    NumComputeOps += getNumOps(A->getType());
    if (!Sum)
      return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);

    if (UseFPOp) {
      if (AllowContraction) {
        // Use fmuladd for floating point operations and let the backend decide
        // if that's profitable.
        Function *FMulAdd = Intrinsic::getDeclaration(
            Func.getParent(), Intrinsic::fmuladd, A->getType());
        return Builder.CreateCall(FMulAdd, {A, B, Sum});
      }
      NumComputeOps += getNumOps(A->getType());
      Value *Mul = Builder.CreateFMul(A, B);
      return Builder.CreateFAdd(Sum, Mul);
    }

    NumComputeOps += getNumOps(A->getType());
    Value *Mul = Builder.CreateMul(A, B);
    return Builder.CreateAdd(Sum, Mul);
  }

  /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For
  /// users with shape information, there's nothing to do: they will use the
  /// cached value when they are lowered. For other users, \p Matrix is
  /// flattened and the uses are updated to use it. Also marks \p Inst for
  /// deletion.
  void finalizeLowering(Instruction *Inst, MatrixTy Matrix,
                        IRBuilder<> &Builder) {
    Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));

    ToRemove.push_back(Inst);
    Value *Flattened = nullptr;
    for (Use &U : llvm::make_early_inc_range(Inst->uses())) {
      if (ShapeMap.find(U.getUser()) == ShapeMap.end()) {
        // Flatten lazily; only emitted if at least one shape-less user exists.
        if (!Flattened)
          Flattened = Matrix.embedInVector(Builder);
        U.set(Flattened);
      }
    }
  }

  /// Compute \p Result += \p A * \p B for input matrices with left-associating
  /// addition.
  ///
  /// We can fold a transpose into the operand that is used to extract scalars.
  /// This is the first operands with row-major and the second with
  /// column-major. If \p IsScalarMatrixTransposed we assume the appropriate
  /// operand is transposed.
  void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
                          const MatrixTy &B, bool AllowContraction,
                          IRBuilder<> &Builder, bool IsTiled,
                          bool IsScalarMatrixTransposed) {
    // VF = number of result elements that fit in one fixed-width vector
    // register (at least 1).
    const unsigned VF = std::max<unsigned>(
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
                .getFixedSize() /
            Result.getElementType()->getPrimitiveSizeInBits().getFixedSize(),
        1U);
    unsigned R = Result.getNumRows();
    unsigned C = Result.getNumColumns();
    unsigned M = A.getNumColumns();

    bool IsFP = Result.getElementType()->isFloatingPointTy();
    assert(A.isColumnMajor() == B.isColumnMajor() &&
           Result.isColumnMajor() == A.isColumnMajor() &&
           "operands must agree on matrix layout");
    unsigned NumComputeOps = 0;
    if (A.isColumnMajor()) {
      // Multiply columns from the first operand with scalars from the second
      // operand. Then move along the K axes and accumulate the columns. With
      // this the adds can be vectorized without reassociation.
      for (unsigned J = 0; J < C; ++J) {
        unsigned BlockSize = VF;
        // If Result is zero, we don't need to accumulate in the K==0 iteration.
        bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J));

        for (unsigned I = 0; I < R; I += BlockSize) {
          // Gradually lower the vectorization factor to cover the remainder.
          while (I + BlockSize > R)
            BlockSize /= 2;

          // When tiling, the tile may already hold a partial sum to extend.
          Value *Sum = IsTiled ? Result.extractVector(I, J, BlockSize, Builder)
                               : nullptr;
          for (unsigned K = 0; K < M; ++K) {
            Value *L = A.extractVector(I, K, BlockSize, Builder);
            Value *RH = Builder.CreateExtractElement(
                B.getColumn(IsScalarMatrixTransposed ? K : J),
                IsScalarMatrixTransposed ? J : K);
            Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
            Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat,
                               Result.getElementType()->isFloatingPointTy(),
                               Builder, AllowContraction, NumComputeOps);
          }
          Result.setVector(J,
                           insertVector(Result.getVector(J), I, Sum, Builder));
        }
      }
    } else {
      // Multiply rows from the second operand with scalars from the first
      // operand. Then move along the K axes and accumulate the rows. With this
      // the adds can be vectorized without reassociation.
      for (unsigned I = 0; I < R; ++I) {
        unsigned BlockSize = VF;
        bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I));
        for (unsigned J = 0; J < C; J += BlockSize) {
          // Gradually lower the vectorization factor to cover the remainder.
          while (J + BlockSize > C)
            BlockSize /= 2;

          Value *Sum = nullptr;
          for (unsigned K = 0; K < M; ++K) {
            Value *R = B.extractVector(K, J, BlockSize, Builder);
            Value *LH = Builder.CreateExtractElement(
                A.getVector(IsScalarMatrixTransposed ? K : I),
                IsScalarMatrixTransposed ? I : K);
            Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat");
            Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R,
                               IsFP, Builder, AllowContraction, NumComputeOps);
          }
          Result.setVector(I,
                           insertVector(Result.getVector(I), J, Sum, Builder));
        }
      }
    }
    Result.addNumComputeOps(NumComputeOps);
  }

  /// Ensure that the memory in \p Load does not alias \p Store by potentially
  /// copying it to a new location. This new or otherwise the original location
  /// is returned.
  Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
                               CallInst *MatMul) {
    MemoryLocation StoreLoc = MemoryLocation::get(Store);
    MemoryLocation LoadLoc = MemoryLocation::get(Load);

    // If we can statically determine noalias we're good.
    if (AA->isNoAlias(LoadLoc, StoreLoc))
      return Load->getPointerOperand();

    // Create code to check if the memory locations of the Load and Store
    // overlap and if they do, copy Load's operand to a new buffer.

    // First, create new blocks for 2nd part of the check and the copy.
    BasicBlock *Check0 = MatMul->getParent();
    // FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a
    // DT. Manually collect dominator tree updates, to avoid unnecessary work,
    // as we adjust Check0 and Check1's branches.
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    for (BasicBlock *Succ : successors(Check0))
      DTUpdates.push_back({DT->Delete, Check0, Succ});

    BasicBlock *Check1 =
        SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
                   nullptr, "alias_cont");
    BasicBlock *Copy =
        SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
                   nullptr, "copy");
    BasicBlock *Fusion =
        SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
                   nullptr, "no_alias");

    // Check if the loaded memory location begins before the end of the store
    // location. If the condition holds, they might overlap, otherwise they are
    // guaranteed to not overlap.
    IRBuilder<> Builder(MatMul);
    Check0->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(Check0);
    Type *IntPtrTy = Builder.getIntPtrTy(Load->getModule()->getDataLayout());
    Value *StoreBegin = Builder.CreatePtrToInt(
        const_cast<Value *>(StoreLoc.Ptr), IntPtrTy, "store.begin");
    Value *StoreEnd = Builder.CreateAdd(
        StoreBegin, ConstantInt::get(IntPtrTy, StoreLoc.Size.getValue()),
        "store.end", true, true);
    Value *LoadBegin = Builder.CreatePtrToInt(const_cast<Value *>(LoadLoc.Ptr),
                                              IntPtrTy, "load.begin");
    Builder.CreateCondBr(Builder.CreateICmpULT(LoadBegin, StoreEnd), Check1,
                         Fusion);

    // Check if the store begins before the end of the load location. If the
    // condition holds, they alias, otherwise they are guaranteed to not
    // overlap.
    Check1->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(Check1, Check1->begin());
    Value *LoadEnd = Builder.CreateAdd(
        LoadBegin, ConstantInt::get(IntPtrTy, LoadLoc.Size.getValue()),
        "load.end", true, true);
    Builder.CreateCondBr(Builder.CreateICmpULT(StoreBegin, LoadEnd), Copy,
                         Fusion);

    // Copy load operand to new alloca.
    Builder.SetInsertPoint(Copy, Copy->begin());
    AllocaInst *NewLd =
        Builder.CreateAlloca(Load->getType(), Load->getPointerAddressSpace());
    Builder.CreateMemCpy(NewLd, NewLd->getAlign(),
                         Load->getPointerOperand(), Load->getAlign(),
                         LoadLoc.Size.getValue());
    // Merge point: pick the original pointer on the no-overlap paths, the
    // fresh copy otherwise.
    Builder.SetInsertPoint(Fusion, Fusion->begin());
    PHINode *PHI = Builder.CreatePHI(Load->getPointerOperandType(), 3);
    PHI->addIncoming(Load->getPointerOperand(), Check0);
    PHI->addIncoming(Load->getPointerOperand(), Check1);
    PHI->addIncoming(NewLd, Copy);

    // Adjust DT.
    DTUpdates.push_back({DT->Insert, Check0, Check1});
    DTUpdates.push_back({DT->Insert, Check0, Fusion});
    DTUpdates.push_back({DT->Insert, Check1, Copy});
    DTUpdates.push_back({DT->Insert, Check1, Fusion});
    DT->applyUpdates(DTUpdates);
    return PHI;
  }

  /// Returns true if fusing/tiling \p MatMul is considered profitable (or is
  /// forced via -force-fuse-matrix): there must be reuse along one axis and
  /// the operands must not already fit in the available vector registers.
  bool isFusionProfitable(CallInst *MatMul) {
    if (ForceFusion)
      return true;

    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));

    const unsigned R = LShape.NumRows;
    const unsigned C = RShape.NumColumns;
    const unsigned M = LShape.NumColumns;
    auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();

    const unsigned VF = std::max<unsigned>(
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
                .getFixedSize() /
            EltType->getPrimitiveSizeInBits().getFixedSize(),
        1U);

    // Cost model for tiling
    //
    // For tiling to be beneficial, we need reuse either along the R or
    // the C axis. We vectorize along the R axis so that means at least
    // 3 elements.
    // TODO: Also consider cost of copying if operands alias.
    if (R <= VF && C == 1)
      return false;
    // Then we need enough elements to exceed the number of vector
    // registers we have. Note that this is an oversimplification since
    // fusing also takes some extra loads which may exceed the number of
    // reloads necessary.
    unsigned Op0Regs = (R + VF - 1) / VF * M;
    unsigned Op1Regs = (M + VF - 1) / VF * C;
    return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true);
  }

  /// Returns an all-zero \p R x \p C matrix made of \p C zero column vectors.
  MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
    MatrixTy Res;
    auto *ColumType = FixedVectorType::get(EltType, R);
    for (unsigned I = 0; I < C; ++I)
      Res.addVector(ConstantAggregateZero::get(ColumType));
    return Res;
  }

  /// Emit a tiled multiply of the matrices at \p LPtr and \p RPtr into the
  /// destination of \p Store, using an actual loop nest (rather than fully
  /// unrolled tiles) with TileSize x TileSize tiles.
  void createTiledLoops(CallInst *MatMul, Value *LPtr, ShapeInfo LShape,
                        Value *RPtr, ShapeInfo RShape, StoreInst *Store,
                        bool AllowContract) {
    auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();

    // Create the main tiling loop nest.
    TileInfo TI(LShape.NumRows, RShape.NumColumns, LShape.NumColumns, TileSize);
    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
    Instruction *InsertI = cast<Instruction>(MatMul);
    BasicBlock *Start = InsertI->getParent();
    BasicBlock *End =
        SplitBlock(InsertI->getParent(), InsertI, DT, LI, nullptr, "continue");
    IRBuilder<> Builder(MatMul);
    BasicBlock *InnerBody = TI.CreateTiledLoops(Start, End, Builder, DTU, *LI);

    Type *TileVecTy =
        FixedVectorType::get(MatMul->getType()->getScalarType(), TileSize);
    MatrixTy TileResult;
    // Insert in the inner loop header.
    Builder.SetInsertPoint(TI.InnerLoopHeader->getTerminator());
    // Create PHI nodes for the result columns to accumulate across iterations.
    SmallVector<PHINode *, 4> ColumnPhis;
    for (unsigned I = 0; I < TileSize; I++) {
      auto *Phi = Builder.CreatePHI(TileVecTy, 2, "result.vec." + Twine(I));
      Phi->addIncoming(ConstantAggregateZero::get(TileVecTy),
                       TI.RowLoopHeader->getSingleSuccessor());
      TileResult.addVector(Phi);
      ColumnPhis.push_back(Phi);
    }

    // Insert in the inner loop body, which computes
    //   Res += Load(CurrentRow, K) * Load(K, CurrentColumn)
    Builder.SetInsertPoint(InnerBody->getTerminator());
    // Load tiles of the operands.
    MatrixTy A = loadMatrix(LPtr, {}, false, LShape, TI.CurrentRow, TI.CurrentK,
                            {TileSize, TileSize}, EltType, Builder);
    MatrixTy B = loadMatrix(RPtr, {}, false, RShape, TI.CurrentK, TI.CurrentCol,
                            {TileSize, TileSize}, EltType, Builder);
    emitMatrixMultiply(TileResult, A, B, AllowContract, Builder, true, false);
    // Store result after the inner loop is done.
    Builder.SetInsertPoint(TI.RowLoopLatch->getTerminator());
    storeMatrix(TileResult, Store->getPointerOperand(), Store->getAlign(),
                Store->isVolatile(), {LShape.NumRows, RShape.NumColumns},
                TI.CurrentRow, TI.CurrentCol, EltType, Builder);

    // Close the accumulator PHIs with the values produced by this iteration.
    for (unsigned I = 0; I < TileResult.getNumVectors(); I++)
      ColumnPhis[I]->addIncoming(TileResult.getVector(I), TI.InnerLoopLatch);

    // Force unrolling of a few iterations of the inner loop, to make sure there
    // is enough work per iteration.
    // FIXME: The unroller should make this decision directly instead, but
    // currently the cost-model is not up to the task.
    unsigned InnerLoopUnrollCount = std::min(10u, LShape.NumColumns / TileSize);
    addStringMetadataToLoop(LI->getLoopFor(TI.InnerLoopHeader),
                            "llvm.loop.unroll.count", InnerLoopUnrollCount);
  }

  /// Lower a {ld, ld} -> matmul -> st chain as a fully fused, tiled multiply,
  /// either via an explicit loop nest or unrolled tiles. The load, multiply
  /// and store instructions are eliminated and recorded in \p FusedInsts.
  void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
                      StoreInst *Store,
                      SmallPtrSetImpl<Instruction *> &FusedInsts) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Tiling only supported for column-major matrixes at the moment!");
    if (!isFusionProfitable(MatMul))
      return;

    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));

    const unsigned R = LShape.NumRows;
    const unsigned C = RShape.NumColumns;
    const unsigned M = LShape.NumColumns;
    auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();

    // Guard against the loads aliasing the store by copying if needed.
    Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
    Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
    Value *CPtr = Store->getPointerOperand();

    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
                                                  MatMul->hasAllowContract());
    if (TileUseLoops && (R % TileSize == 0 && C % TileSize == 0))
      createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store,
                       AllowContract);
    else {
      IRBuilder<> Builder(Store);
      for (unsigned J = 0; J < C; J += TileSize)
        for (unsigned I = 0; I < R; I += TileSize) {
          const unsigned TileR = std::min(R - I, unsigned(TileSize));
          const unsigned TileC = std::min(C - J, unsigned(TileSize));
          MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);

          for (unsigned K = 0; K < M; K += TileSize) {
            const unsigned TileM = std::min(M - K, unsigned(TileSize));
            MatrixTy A =
                loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(),
                           LShape, Builder.getInt64(I), Builder.getInt64(K),
                           {TileR, TileM}, EltType, Builder);
            MatrixTy B =
                loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
                           RShape, Builder.getInt64(K), Builder.getInt64(J),
                           {TileM, TileC}, EltType, Builder);
            emitMatrixMultiply(Res, A, B, AllowContract, Builder, true, false);
          }
          // NOTE(review): the destination matrix is R x C, so {R, C} would be
          // the semantically correct shape here; {R, M} only works because
          // storeMatrix consumes just the stride (== NumRows == R). Confirm
          // before changing.
          storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M},
                      Builder.getInt64(I), Builder.getInt64(J), EltType,
                      Builder);
        }
    }

    // Mark eliminated instructions as fused and remove them.
    FusedInsts.insert(Store);
    FusedInsts.insert(MatMul);
    Store->eraseFromParent();
    MatMul->eraseFromParent();
    if (LoadOp0->hasNUses(0)) {
      FusedInsts.insert(LoadOp0);
      LoadOp0->eraseFromParent();
    }
    if (LoadOp1->hasNUses(0)) {
      FusedInsts.insert(LoadOp1);
      LoadOp1->eraseFromParent();
    }
  }

  /// Try to lower matrix multiply chains by fusing operations.
  ///
  /// Call finalizeLowering on lowered instructions. Instructions that are
  /// completely eliminated by fusion are added to \p FusedInsts.
  void LowerMatrixMultiplyFused(CallInst *MatMul,
                                SmallPtrSetImpl<Instruction *> &FusedInsts) {
    if (!FuseMatrix || !DT)
      return;

    assert(AA && LI && "Analyses should be available");

    Value *A = MatMul->getArgOperand(0);
    Value *B = MatMul->getArgOperand(1);

    // We can fold the transpose into the operand that is used to fetch scalars.
    Value *T;
    if (MatrixLayout == MatrixLayoutTy::ColumnMajor
            ? match(B, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(T)))
            : match(A, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(T)))) {
      IRBuilder<> Builder(MatMul);
      auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
      ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
      ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
      const unsigned R = LShape.NumRows;
      const unsigned M = LShape.NumColumns;
      const unsigned C = RShape.NumColumns;

      MatrixTy MA;
      MatrixTy MB;

      Value *Transpose;
      if (MatrixLayout == MatrixLayoutTy::ColumnMajor) {
        MA = getMatrix(A, ShapeInfo(R, M), Builder);
        MB = getMatrix(T, ShapeInfo(C, M), Builder);
        Transpose = B;
      } else {
        MA = getMatrix(T, ShapeInfo(R, M), Builder);
        MB = getMatrix(B, ShapeInfo(C, M), Builder);
        Transpose = A;
      }

      // Initialize the output
      MatrixTy Result(R, C, EltType);

      bool AllowContract =
          AllowContractEnabled ||
          (isa<FPMathOperator>(MatMul) && MatMul->hasAllowContract());
      emitMatrixMultiply(Result, MA, MB, AllowContract, Builder, false, true);

      FusedInsts.insert(MatMul);
      FusedInsts.insert(cast<Instruction>(Transpose));
      if (Transpose->hasOneUse())
        ToRemove.push_back(cast<Instruction>(Transpose));
      finalizeLowering(MatMul, Result, Builder);
      // TODO: add a fake entry for the folded instruction so that this is
      // included in the expression in the remark.
      Inst2ColumnMatrix[Transpose] = MatrixTy(M, C, EltType);
      return;
    }

    if (!MatMul->hasOneUse() || MatrixLayout != MatrixLayoutTy::ColumnMajor)
      return;

    // Lower {ld, ld} -> matmul -> st chains. No need to call finalizeLowering
    // since the single store user will be lowered as part of this.
    auto *LoadOp0 = dyn_cast<LoadInst>(A);
    auto *LoadOp1 = dyn_cast<LoadInst>(B);
    auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin());
    if (LoadOp0 && LoadOp1 && Store) {
      // The store address must dominate the MatMul instruction, otherwise
      // we create invalid IR.
      // FIXME: See if we can hoist the store address computation.
      auto *AddrI = dyn_cast<Instruction>(Store->getOperand(1));
      if (AddrI && (!DT->dominates(AddrI, MatMul)))
        return;

      emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
      return;
    }
  }

  /// Lowers llvm.matrix.multiply.
  void LowerMultiply(CallInst *MatMul) {
    IRBuilder<> Builder(MatMul);
    auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));

    const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder);
    const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder);
    assert(Lhs.getElementType() == Rhs.getElementType() &&
           "Matrix multiply argument element types do not match.");

    const unsigned R = LShape.NumRows;
    const unsigned C = RShape.NumColumns;
    assert(LShape.NumColumns == RShape.NumRows);

    // Initialize the output
    MatrixTy Result(R, C, EltType);
    assert(Lhs.getElementType() == Result.getElementType() &&
           "Matrix multiply result element type does not match arguments.");

    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
                                                  MatMul->hasAllowContract());
    emitMatrixMultiply(Result, Lhs, Rhs, AllowContract, Builder, false, false);
    finalizeLowering(MatMul, Result, Builder);
  }

  /// Lowers llvm.matrix.transpose.
1461 void LowerTranspose(CallInst *Inst) { 1462 MatrixTy Result; 1463 IRBuilder<> Builder(Inst); 1464 Value *InputVal = Inst->getArgOperand(0); 1465 VectorType *VectorTy = cast<VectorType>(InputVal->getType()); 1466 ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2)); 1467 MatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder); 1468 1469 const unsigned NewNumVecs = 1470 InputMatrix.isColumnMajor() ? ArgShape.NumRows : ArgShape.NumColumns; 1471 const unsigned NewNumElts = 1472 InputMatrix.isColumnMajor() ? ArgShape.NumColumns : ArgShape.NumRows; 1473 1474 for (unsigned I = 0; I < NewNumVecs; ++I) { 1475 // Build a single result vector. First initialize it. 1476 Value *ResultVector = UndefValue::get( 1477 FixedVectorType::get(VectorTy->getElementType(), NewNumElts)); 1478 // Go through the old elements and insert it into the resulting vector. 1479 for (auto J : enumerate(InputMatrix.vectors())) { 1480 Value *Elt = Builder.CreateExtractElement(J.value(), I); 1481 // Row and column indices are transposed. 1482 ResultVector = 1483 Builder.CreateInsertElement(ResultVector, Elt, J.index()); 1484 } 1485 Result.addVector(ResultVector); 1486 } 1487 1488 // TODO: Improve estimate of operations needed for transposes. Currently we 1489 // just count the insertelement/extractelement instructions, but do not 1490 // account for later simplifications/combines. 1491 finalizeLowering( 1492 Inst, 1493 Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns), 1494 Builder); 1495 } 1496 1497 /// Lower load instructions, if shape information is available. 
  bool VisitLoad(LoadInst *Inst, Value *Ptr, IRBuilder<> &Builder) {
    auto I = ShapeMap.find(Inst);
    if (I == ShapeMap.end())
      return false;

    LowerLoad(Inst, Ptr, Inst->getAlign(),
              Builder.getInt64(I->second.getStride()), Inst->isVolatile(),
              I->second);
    return true;
  }

  /// Lower store instructions, if shape information is available for the
  /// stored value.
  bool VisitStore(StoreInst *Inst, Value *StoredVal, Value *Ptr,
                  IRBuilder<> &Builder) {
    auto I = ShapeMap.find(StoredVal);
    if (I == ShapeMap.end())
      return false;

    LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
               Builder.getInt64(I->second.getStride()), Inst->isVolatile(),
               I->second);
    return true;
  }

  /// Lower binary operators, if shape information is available.
  bool VisitBinaryOperator(BinaryOperator *Inst) {
    auto I = ShapeMap.find(Inst);
    if (I == ShapeMap.end())
      return false;

    Value *Lhs = Inst->getOperand(0);
    Value *Rhs = Inst->getOperand(1);

    IRBuilder<> Builder(Inst);
    ShapeInfo &Shape = I->second;

    MatrixTy Result;
    // Both operands share the instruction's shape.
    MatrixTy A = getMatrix(Lhs, Shape, Builder);
    MatrixTy B = getMatrix(Rhs, Shape, Builder);
    assert(A.isColumnMajor() == B.isColumnMajor() &&
           Result.isColumnMajor() == A.isColumnMajor() &&
           "operands must agree on matrix layout");

    // Helper to perform binary op on vectors.
    auto BuildVectorOp = [&Builder, Inst](Value *LHS, Value *RHS) {
      switch (Inst->getOpcode()) {
      case Instruction::Add:
        return Builder.CreateAdd(LHS, RHS);
      case Instruction::Mul:
        return Builder.CreateMul(LHS, RHS);
      case Instruction::Sub:
        return Builder.CreateSub(LHS, RHS);
      case Instruction::FAdd:
        return Builder.CreateFAdd(LHS, RHS);
      case Instruction::FMul:
        return Builder.CreateFMul(LHS, RHS);
      case Instruction::FSub:
        return Builder.CreateFSub(LHS, RHS);
      default:
        llvm_unreachable("Unsupported binary operator for matrix");
      }
    };

    // Apply the operator vector-by-vector (column-wise or row-wise, matching
    // the layout).
    for (unsigned I = 0; I < Shape.getNumVectors(); ++I)
      Result.addVector(BuildVectorOp(A.getVector(I), B.getVector(I)));

    finalizeLowering(Inst,
                     Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                             Result.getNumVectors()),
                     Builder);
    return true;
  }

  /// Lower unary operators, if shape information is available.
  bool VisitUnaryOperator(UnaryOperator *Inst) {
    auto I = ShapeMap.find(Inst);
    if (I == ShapeMap.end())
      return false;

    Value *Op = Inst->getOperand(0);

    IRBuilder<> Builder(Inst);
    ShapeInfo &Shape = I->second;

    MatrixTy Result;
    MatrixTy M = getMatrix(Op, Shape, Builder);

    // Helper to perform unary op on vectors.
    auto BuildVectorOp = [&Builder, Inst](Value *Op) {
      switch (Inst->getOpcode()) {
      case Instruction::FNeg:
        return Builder.CreateFNeg(Op);
      default:
        llvm_unreachable("Unsupported unary operator for matrix");
      }
    };

    for (unsigned I = 0; I < Shape.getNumVectors(); ++I)
      Result.addVector(BuildVectorOp(M.getVector(I)));

    finalizeLowering(Inst,
                     Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                             Result.getNumVectors()),
                     Builder);
    return true;
  }

  /// Helper to linearize a matrix expression tree into a string.
Currently
/// matrix expressions are linearized by starting at an expression leaf and
/// linearizing bottom up.
struct ExprLinearizer {
  /// Soft limit on the emitted line length; longer lines are broken at the
  /// next operand boundary (see maybeIndent).
  unsigned LengthToBreak = 100;
  /// Backing storage for Stream; holds the linearized expression text.
  std::string Str;
  raw_string_ostream Stream;
  /// Number of characters written to the current output line so far.
  unsigned LineLength = 0;
  const DataLayout &DL;

  /// Mapping from instructions to matrixes. It is used to identify
  /// matrix instructions.
  const MapVector<Value *, MatrixTy> &Inst2Matrix;

  /// Mapping from values to the leaves of all expressions that the value is
  /// part of.
  const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;

  /// Set of matrix expressions in the scope of a given DISubprogram.
  const SmallSetVector<Value *, 32> &ExprsInSubprogram;

  /// Leaf node of the expression to linearize.
  Value *Leaf;

  /// Used to keep track of sub-expressions that get reused while linearizing
  /// the expression. Re-used sub-expressions are marked as (reused).
  SmallPtrSet<Value *, 8> ReusedExprs;

  ExprLinearizer(const DataLayout &DL,
                 const MapVector<Value *, MatrixTy> &Inst2Matrix,
                 const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
                 const SmallSetVector<Value *, 32> &ExprsInSubprogram,
                 Value *Leaf)
      : Str(), Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),
        ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}

  /// Emit \p N spaces and account for them in the current line length.
  void indent(unsigned N) {
    LineLength += N;
    for (unsigned i = 0; i < N; i++)
      Stream << " ";
  }

  /// Start a new output line.
  void lineBreak() {
    Stream << "\n";
    LineLength = 0;
  }

  /// Break the line if it exceeds LengthToBreak; indent fresh lines to
  /// \p Indent.
  void maybeIndent(unsigned Indent) {
    if (LineLength >= LengthToBreak)
      lineBreak();

    if (LineLength == 0)
      indent(Indent);
  }

  /// Write \p S and account for its length.
  void write(StringRef S) {
    LineLength += S.size();
    Stream << S;
  }

  /// Walk through load/store pointer operands (recursively) and then strip
  /// casts/GEPs to find the underlying object of \p V, if it is a pointer.
  Value *getUnderlyingObjectThroughLoads(Value *V) {
    if (Value *Ptr = getPointerOperand(V))
      return getUnderlyingObjectThroughLoads(Ptr);
    else if (V->getType()->isPointerTy())
      return getUnderlyingObject(V);
    return V;
  }

  /// Returns true if \p V is a matrix value in the given subprogram.
  bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); }

  /// If \p V is a matrix value, print its shape as NumRows x NumColumns to
  /// \p SS.
  void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {
    auto M = Inst2Matrix.find(V);
    if (M == Inst2Matrix.end())
      SS << "unknown";
    else {
      SS << M->second.getNumRows();
      SS << "x";
      SS << M->second.getNumColumns();
    }
  }

  /// Write the called function name. Handles calls to llvm.matrix.*
  /// specially: we write the name, followed by the dimensions of the input
  /// matrixes, followed by the scalar type name.
  void writeFnName(CallInst *CI) {
    if (!CI->getCalledFunction())
      write("<no called fn>");
    else {
      StringRef Name = CI->getCalledFunction()->getName();
      if (!Name.startswith("llvm.matrix")) {
        write(Name);
        return;
      }
      // From here on the callee is known to be a matrix intrinsic; emit its
      // name without the "llvm.matrix." prefix, then its shape suffix.
      IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
      write(StringRef(Intrinsic::getName(II->getIntrinsicID(), {}))
                .drop_front(StringRef("llvm.matrix.").size()));
      write(".");
      std::string Tmp;
      raw_string_ostream SS(Tmp);

      switch (II->getIntrinsicID()) {
      case Intrinsic::matrix_multiply:
        prettyPrintMatrixType(II->getOperand(0), SS);
        SS << ".";
        prettyPrintMatrixType(II->getOperand(1), SS);
        SS << "." << *II->getType()->getScalarType();
        break;
      case Intrinsic::matrix_transpose:
        prettyPrintMatrixType(II->getOperand(0), SS);
        SS << "." << *II->getType()->getScalarType();
        break;
      case Intrinsic::matrix_column_major_load:
        // The loaded matrix is the intrinsic's result, not an operand.
        prettyPrintMatrixType(II, SS);
        SS << "." << *II->getType()->getScalarType();
        break;
      case Intrinsic::matrix_column_major_store:
        prettyPrintMatrixType(II->getOperand(0), SS);
        SS << "." << *II->getOperand(0)->getType()->getScalarType();
        break;
      default:
        llvm_unreachable("Unhandled case");
      }
      SS.flush();
      write(Tmp);
    }
  }

  /// Return the number of trailing shape arguments (rows/columns/stride) of
  /// the matrix intrinsic \p CI; 0 for non-intrinsic calls. Used to skip
  /// shape operands when linearizing call arguments.
  unsigned getNumShapeArgs(CallInst *CI) const {
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
      switch (II->getIntrinsicID()) {
      case Intrinsic::matrix_multiply:
        return 3;
      case Intrinsic::matrix_transpose:
        return 2;
      case Intrinsic::matrix_column_major_load:
      case Intrinsic::matrix_column_major_store:
        return 3;
      default:
        return 0;
      }
    }
    return 0;
  }

  /// Special printing for values: for pointers, we print if they refer to an
  /// (function) external address or a stack address, for other values we
  /// either print the constant or "scalar"/"matrix" for other values.
  void write(Value *V) {
    V = getUnderlyingObjectThroughLoads(V);
    if (V->getType()->isPointerTy()) {
      if (isa<AllocaInst>(V)) {
        Stream << "stack addr";
        LineLength += StringRef("stack addr").size();
      } else {
        Stream << "addr";
        LineLength += StringRef("addr").size();
      }
      if (!V->getName().empty()) {
        Stream << " %" << V->getName() << "";
        // +2 accounts for the " %" prefix written above.
        LineLength += V->getName().size() + 2;
      }
      return;
    }

    std::string Tmp;
    raw_string_ostream TmpStream(Tmp);

    if (auto *CI = dyn_cast<ConstantInt>(V))
      TmpStream << CI->getValue();
    else if (isa<Constant>(V))
      TmpStream << "constant";
    else {
      if (isMatrix(V))
        TmpStream << "matrix";
      else
        TmpStream << "scalar";
    }
    TmpStream.flush();
    Tmp = std::string(StringRef(Tmp).trim());
    LineLength += Tmp.size();
    Stream << Tmp;
  }

  /// Linearize expression \p Expr starting at an indentation of \p Indent.
  /// Expressions that are re-used multiple times are prefixed with (reused)
  /// at the re-used root instruction.
  void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused,
                     bool ParentShared) {
    auto *I = cast<Instruction>(Expr);
    maybeIndent(Indent);
    SmallVector<Value *, 8> Ops;

    // Is Expr shared with other expression leaves?
    bool ExprShared = false;

    // Deal with shared subtrees. Mark them as shared, if required.
    if (!ParentShared) {
      // collectSharedInfo guarantees every expression reachable from Leaf has
      // an entry in Shared that contains Leaf.
      auto SI = Shared.find(Expr);
      assert(SI != Shared.end() && SI->second.count(Leaf));

      for (Value *S : SI->second) {
        if (S == Leaf)
          continue;
        DebugLoc DL = cast<Instruction>(S)->getDebugLoc();
        write("shared with remark at line " + std::to_string(DL.getLine()) +
              " column " + std::to_string(DL.getCol()) + " (");
      }
      ExprShared = SI->second.size() > 1;
    }

    bool Reused = !ReusedExprs.insert(Expr).second;
    if (Reused && !ParentReused)
      write("(reused) ");

    if (auto *CI = dyn_cast<CallInst>(I)) {
      writeFnName(CI);

      // Skip the trailing shape arguments; they are not part of the
      // user-visible expression.
      Ops.append(CI->arg_begin(), CI->arg_end() - getNumShapeArgs(CI));
    } else if (isa<BitCastInst>(Expr)) {
      // Special case bitcasts, which are used to materialize matrixes from
      // non-matrix ops.
      write("matrix");
      return;
    } else {
      Ops.append(I->value_op_begin(), I->value_op_end());
      write(std::string(I->getOpcodeName()));
    }

    write(std::string("("));

    unsigned NumOpsToBreak = 1;
    if (match(Expr, m_Intrinsic<Intrinsic::matrix_column_major_load>()))
      NumOpsToBreak = 2;

    for (Value *Op : Ops) {
      if (Ops.size() > NumOpsToBreak)
        lineBreak();

      maybeIndent(Indent + 1);
      if (isMatrix(Op))
        linearizeExpr(Op, Indent + 1, Reused, ExprShared);
      else
        write(Op);
      if (Op != Ops.back())
        write(", ");
    }

    write(")");
  }

  /// Flush the stream and return the accumulated linearized expression.
  const std::string &getResult() {
    Stream.flush();
    return Str;
  }
};

/// Generate remarks for matrix operations in a function. To generate remarks
/// for matrix expressions, the following approach is used:
/// 1. Use the inlined-at debug information to group matrix operations to the
///    DISubprograms they are contained in.
/// 2.
Collect leaves of matrix expressions (done in
///    RemarkGenerator::getExpressionLeaves) for each subprogram - expression
///    mapping. Leaves are lowered matrix instructions without other matrix
///    users (like stores) in the current subprogram.
/// 3. For each leaf, create a remark containing a linearized version of the
///    matrix expression. The expression is linearized by a recursive
///    bottom-up traversal of the matrix operands, starting at a leaf. Note
///    that multiple leaves can share sub-expressions. Shared subexpressions
///    are explicitly marked as shared().
struct RemarkGenerator {
  /// Mapping from lowered instructions to their matrix representation; its
  /// keys define the set of matrix operations to report on.
  const MapVector<Value *, MatrixTy> &Inst2Matrix;
  OptimizationRemarkEmitter &ORE;
  Function &Func;
  const DataLayout &DL;

  RemarkGenerator(const MapVector<Value *, MatrixTy> &Inst2Matrix,
                  OptimizationRemarkEmitter &ORE, Function &Func)
      : Inst2Matrix(Inst2Matrix), ORE(ORE), Func(Func),
        DL(Func.getParent()->getDataLayout()) {}

  /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are
  /// instructions in Inst2Matrix returning void or without any users in
  /// \p ExprsInSubprogram. Currently that should only include stores.
  SmallVector<Value *, 4>
  getExpressionLeaves(const SmallSetVector<Value *, 32> &ExprsInSubprogram) {
    SmallVector<Value *, 4> Leaves;
    for (auto *Expr : ExprsInSubprogram)
      if (Expr->getType()->isVoidTy() ||
          !any_of(Expr->users(), [&ExprsInSubprogram](User *U) {
            return ExprsInSubprogram.count(U);
          }))
        Leaves.push_back(Expr);
    return Leaves;
  }

  /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf
  /// to all visited expressions in \p Shared. Limit the matrix operations to
  /// the ones in \p ExprsInSubprogram.
  void collectSharedInfo(Value *Leaf, Value *V,
                         const SmallSetVector<Value *, 32> &ExprsInSubprogram,
                         DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {

    if (!ExprsInSubprogram.count(V))
      return;

    auto I = Shared.insert({V, {}});
    I.first->second.insert(Leaf);

    for (Value *Op : cast<Instruction>(V)->operand_values())
      collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);
  }

  /// Calculate the number of exclusive and shared op counts for expression
  /// starting at \p V. Expressions used multiple times are counted once.
  /// Limit the matrix operations to the ones in \p ExprsInSubprogram.
  std::pair<OpInfoTy, OpInfoTy>
  sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
             const SmallSetVector<Value *, 32> &ExprsInSubprogram,
             DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
    if (!ExprsInSubprogram.count(Root))
      return {};

    // Already counted this expression. Stop.
    if (!ReusedExprs.insert(Root).second)
      return {};

    OpInfoTy SharedCount;
    OpInfoTy Count;

    // Expressions with a single leaf in Shared are exclusive to this
    // expression tree; more than one leaf means the counts are shared.
    // Both maps are expected to contain Root, since Root is in
    // ExprsInSubprogram and collectSharedInfo ran over all leaves.
    auto I = Shared.find(Root);
    auto CM = Inst2Matrix.find(Root);
    if (I->second.size() == 1)
      Count = CM->second.getOpInfo();
    else
      SharedCount = CM->second.getOpInfo();

    for (Value *Op : cast<Instruction>(Root)->operand_values()) {
      auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared);
      Count += C.first;
      SharedCount += C.second;
    }
    return {Count, SharedCount};
  }

  /// Emit one optimization remark per expression leaf, containing the op
  /// counts (exclusive and shared) and the linearized expression.
  void emitRemarks() {
    if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
      return;

    // Map matrix operations to their containing subprograms, by traversing
    // the inlinedAt chain. If the function does not have a DISubprogram, we
    // only map them to the containing function.
    MapVector<DISubprogram *, SmallVector<Value *, 8>> Subprog2Exprs;
    for (auto &KV : Inst2Matrix) {
      if (Func.getSubprogram()) {
        auto *I = cast<Instruction>(KV.first);
        DILocation *Context = I->getDebugLoc();
        while (Context) {
          auto I =
              Subprog2Exprs.insert({getSubprogram(Context->getScope()), {}});
          I.first->second.push_back(KV.first);
          Context = DebugLoc(Context).getInlinedAt();
        }
      } else {
        auto I = Subprog2Exprs.insert({nullptr, {}});
        I.first->second.push_back(KV.first);
      }
    }
    for (auto &KV : Subprog2Exprs) {
      SmallSetVector<Value *, 32> ExprsInSubprogram(KV.second.begin(),
                                                    KV.second.end());
      auto Leaves = getExpressionLeaves(ExprsInSubprogram);

      DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
      for (Value *Leaf : Leaves)
        collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared);

      // Generate remarks for each leaf.
      for (auto *L : Leaves) {

        // Use the debug location from the inlinedAt chain that belongs to
        // this subprogram, falling back to the leaf's own location.
        DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();
        DILocation *Context = cast<Instruction>(L)->getDebugLoc();
        while (Context) {
          if (getSubprogram(Context->getScope()) == KV.first) {
            Loc = Context;
            break;
          }
          Context = DebugLoc(Context).getInlinedAt();
        }

        SmallPtrSet<Value *, 8> ReusedExprs;
        OpInfoTy Counts, SharedCounts;
        std::tie(Counts, SharedCounts) =
            sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared);

        OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,
                               cast<Instruction>(L)->getParent());

        Rem << "Lowered with ";
        Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
            << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
            << ore::NV("NumComputeOps", Counts.NumComputeOps)
            << " compute ops";

        if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
            SharedCounts.NumComputeOps > 0) {
          Rem << ",\nadditionally "
              << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
              << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
              << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
              << " compute ops"
              << " are shared with other expressions";
        }

        Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL));
        ORE.emit(Rem);
      }
    }
  }

  /// Return the linearized string representation of the expression rooted at
  /// leaf \p L.
  std::string
  linearize(Value *L,
            const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
            const SmallSetVector<Value *, 32> &ExprsInSubprogram,
            const DataLayout &DL) {
    ExprLinearizer Lin(DL, Inst2Matrix, Shared, ExprsInSubprogram, L);
    Lin.linearizeExpr(L, 0, false, false);
    return Lin.getResult();
  }
};
};
} // namespace

PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  OptimizationRemarkEmitter *ORE = nullptr;
  AAResults *AA = nullptr;
  DominatorTree *DT = nullptr;
  LoopInfo *LI = nullptr;

  // In minimal mode only TTI is required; the advanced features that need
  // the other analyses (e.g. fusion/tiling, remarks) are disabled.
  if (!Minimal) {
    ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
    AA = &AM.getResult<AAManager>(F);
    DT = &AM.getResult<DominatorTreeAnalysis>(F);
    LI = &AM.getResult<LoopAnalysis>(F);
  }

  LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE);
  if (LMT.Visit()) {
    PreservedAnalyses PA;
    if (!Minimal) {
      PA.preserve<LoopAnalysis>();
      PA.preserve<DominatorTreeAnalysis>();
    }
    return PA;
  }
  return PreservedAnalyses::all();
}

namespace {

/// Legacy pass manager wrapper for the full matrix lowering, requiring TTI,
/// ORE, AA, DT and LI.
class LowerMatrixIntrinsicsLegacyPass : public FunctionPass {
public:
  static char ID;

  LowerMatrixIntrinsicsLegacyPass() : FunctionPass(ID) {
    initializeLowerMatrixIntrinsicsLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    LowerMatrixIntrinsics LMT(F, TTI, &AA, &DT, &LI, &ORE);
    bool C = LMT.Visit();
    return C;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
  }
};
} // namespace

static const char pass_name[] = "Lower the matrix intrinsics";
char LowerMatrixIntrinsicsLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
                      false, false)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
                    false, false)

Pass *llvm::createLowerMatrixIntrinsicsPass() {
  return new LowerMatrixIntrinsicsLegacyPass();
}

namespace {

/// A lightweight version of the matrix lowering pass that only requires TTI.
/// Advanced features that require DT, AA or ORE like tiling are disabled. This
/// is used to lower matrix intrinsics if the main lowering pass is not run, for
/// example with -O0.
2121 class LowerMatrixIntrinsicsMinimalLegacyPass : public FunctionPass { 2122 public: 2123 static char ID; 2124 2125 LowerMatrixIntrinsicsMinimalLegacyPass() : FunctionPass(ID) { 2126 initializeLowerMatrixIntrinsicsMinimalLegacyPassPass( 2127 *PassRegistry::getPassRegistry()); 2128 } 2129 2130 bool runOnFunction(Function &F) override { 2131 auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2132 LowerMatrixIntrinsics LMT(F, TTI, nullptr, nullptr, nullptr, nullptr); 2133 bool C = LMT.Visit(); 2134 return C; 2135 } 2136 2137 void getAnalysisUsage(AnalysisUsage &AU) const override { 2138 AU.addRequired<TargetTransformInfoWrapperPass>(); 2139 AU.setPreservesCFG(); 2140 } 2141 }; 2142 } // namespace 2143 2144 static const char pass_name_minimal[] = "Lower the matrix intrinsics (minimal)"; 2145 char LowerMatrixIntrinsicsMinimalLegacyPass::ID = 0; 2146 INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsMinimalLegacyPass, 2147 "lower-matrix-intrinsics-minimal", pass_name_minimal, 2148 false, false) 2149 INITIALIZE_PASS_END(LowerMatrixIntrinsicsMinimalLegacyPass, 2150 "lower-matrix-intrinsics-minimal", pass_name_minimal, false, 2151 false) 2152 2153 Pass *llvm::createLowerMatrixIntrinsicsMinimalPass() { 2154 return new LowerMatrixIntrinsicsMinimalLegacyPass(); 2155 } 2156