//===- VectorToGPU.cpp - Convert vector to GPU dialect ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements lowering of vector operations to GPU dialect ops.
//
//===----------------------------------------------------------------------===//

#include <type_traits>

#include "NvGpuSupport.h"
#include "mlir/Conversion/VectorToGPU/VectorToGPU.h"

#include "../PassDetail.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/ADT/TypeSwitch.h"

using namespace mlir;

/// For a vector TransferOpType `xferOp`, an empty `indices` vector, and an
/// AffineMap representing offsets to apply to indices, the function fills
/// `indices` with the original indices plus the offsets. The offsets are
/// applied by taking into account the permutation map of the transfer op. If
/// the `offsetMap` has dimension placeholders, those should be provided in
/// `dimValues`.
template <typename TransferOpType>
static void getXferIndices(OpBuilder &b, TransferOpType xferOp,
                           AffineMap offsetMap, ArrayRef<Value> dimValues,
                           SmallVector<Value, 4> &indices) {
  indices.append(xferOp.getIndices().begin(), xferOp.getIndices().end());
  Location loc = xferOp.getLoc();
  unsigned offsetsIdx = 0;
  for (auto expr : xferOp.getPermutationMap().getResults()) {
    if (auto dim = expr.template dyn_cast<AffineDimExpr>()) {
      Value prevIdx = indices[dim.getPosition()];
      SmallVector<Value, 3> dims(dimValues.begin(), dimValues.end());
      dims.push_back(prevIdx);
      AffineExpr d0 = b.getAffineDimExpr(offsetMap.getNumDims());
      indices[dim.getPosition()] = makeComposedAffineApply(
          b, loc, d0 + offsetMap.getResult(offsetsIdx++), dims);
      continue;
    }
  }
}

// Return true if the contract op can be converted to an MMA matmul.
static bool contractSupportsMMAMatrixType(vector::ContractionOp contract,
                                          bool useNvGpu) {
  if (llvm::size(contract.getMasks()) != 0)
    return false;

  using MapList = ArrayRef<ArrayRef<AffineExpr>>;
  auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
  AffineExpr m, n, k;
  bindDims(contract.getContext(), m, n, k);
  auto iteratorTypes = contract.getIteratorTypes().getValue();
  if (!(isParallelIterator(iteratorTypes[0]) &&
        isParallelIterator(iteratorTypes[1]) &&
        isReductionIterator(iteratorTypes[2])))
    return false;

  // The contract needs to represent a matmul to be able to be converted to an
  // MMAMatrix matmul.
  if (!useNvGpu &&
      contract.getIndexingMapsArray() != infer({{m, k}, {k, n}, {m, n}}))
    return false;
  if (useNvGpu &&
      contract.getIndexingMapsArray() != infer({{m, k}, {n, k}, {m, n}}))
    return false;

  return true;
}

// Return the stride for dimension 0 of |type| if it is a memref and has a
// constant stride.
static llvm::Optional<int64_t>
getMemrefConstantHorizontalStride(ShapedType type) {
  auto memrefType = type.dyn_cast<MemRefType>();
  if (!memrefType)
    return llvm::None;
  // If the memref is 0 or 1D the horizontal stride is 0.
  if (memrefType.getRank() < 2)
    return 0;
  int64_t offset = 0;
  SmallVector<int64_t, 2> strides;
  if (failed(getStridesAndOffset(memrefType, strides, offset)) ||
      strides.back() != 1)
    return llvm::None;
  int64_t stride = strides[strides.size() - 2];
  if (stride == ShapedType::kDynamicStrideOrOffset)
    return llvm::None;
  return stride;
}

// Return true if the transfer op can be converted to an MMA matrix load.
static bool transferReadSupportsMMAMatrixType(vector::TransferReadOp readOp,
                                              bool useNvGpu) {
  if (readOp.getMask() || readOp.hasOutOfBoundsDim() ||
      readOp.getVectorType().getRank() != 2)
    return false;
  if (!getMemrefConstantHorizontalStride(readOp.getShapedType()))
    return false;
  AffineMap map = readOp.getPermutationMap();
  OpBuilder b(readOp.getContext());
  AffineExpr innerDim = b.getAffineDimExpr(map.getNumDims() - 1);
  AffineExpr zero = b.getAffineConstantExpr(0);
  auto broadcastInnerDim = AffineMap::get(map.getNumDims(), 0, {zero, innerDim},
                                          readOp.getContext());

  if (!useNvGpu) {
    // TODO: Support transpose once it is added to GPU dialect ops.
    // For now we only support (d0, d1) -> (d0, d1) and (d0, d1) -> (0, d1).
    return map.isMinorIdentity() || map == broadcastInnerDim;
  }

  return true;
}

// Return true if the transfer op can be converted to an MMA matrix store.
static bool
transferWriteSupportsMMAMatrixType(vector::TransferWriteOp writeOp) {
  // TODO: support 0-d corner case.
  if (writeOp.getTransferRank() == 0)
    return false;

  if (writeOp.getMask() || writeOp.hasOutOfBoundsDim() ||
      writeOp.getVectorType().getRank() != 2)
    return false;
  if (!getMemrefConstantHorizontalStride(writeOp.getShapedType()))
    return false;
  // TODO: Support transpose once it is added to GPU dialect ops.
  if (!writeOp.getPermutationMap().isMinorIdentity())
    return false;
  return true;
}

/// Return true if the constant is a splat to a 2D vector so that it can be
/// converted to an MMA constant matrix op.
static bool constantSupportsMMAMatrixType(arith::ConstantOp constantOp) {
  auto vecType = constantOp.getType().dyn_cast<VectorType>();
  if (!vecType || vecType.getRank() != 2)
    return false;
  return constantOp.getValue().isa<SplatElementsAttr>();
}

/// Return true if this is a broadcast from scalar to a 2D vector.
static bool broadcastSupportsMMAMatrixType(vector::BroadcastOp broadcastOp) {
  return broadcastOp.getVectorType().getRank() == 2 &&
         broadcastOp.getSource().getType().isa<FloatType>();
}

/// Return the MMA elementwise enum associated with `op` if it is supported.
/// Return `llvm::None` otherwise.
static llvm::Optional<gpu::MMAElementwiseOp>
convertElementwiseOpToMMA(Operation *op) {
  if (isa<arith::AddFOp>(op))
    return gpu::MMAElementwiseOp::ADDF;
  if (isa<arith::MulFOp>(op))
    return gpu::MMAElementwiseOp::MULF;
  if (isa<arith::MaxFOp>(op))
    return gpu::MMAElementwiseOp::MAXF;
  if (isa<arith::MinFOp>(op))
    return gpu::MMAElementwiseOp::MINF;
  if (isa<arith::DivFOp>(op))
    return gpu::MMAElementwiseOp::DIVF;
  return llvm::None;
}

/// Return true if the op is supported as an elementwise op on the MMAMatrix
/// type.
static bool elementwiseSupportsMMAMatrixType(Operation *op) {
  return convertElementwiseOpToMMA(op).has_value();
}

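/// Return true if `op` can be converted to an MMA matrix operation and thus
/// kept in a slice that will be rewritten by this pass.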
static bool supportsMMaMatrixType(Operation *op, bool useNvGpu) {
  if (isa<scf::ForOp, scf::YieldOp>(op))
    return true;
  if (auto transferRead = dyn_cast<vector::TransferReadOp>(op))
    return transferReadSupportsMMAMatrixType(transferRead, useNvGpu);
  if (auto transferWrite = dyn_cast<vector::TransferWriteOp>(op))
    return transferWriteSupportsMMAMatrixType(transferWrite);
  if (auto contract = dyn_cast<vector::ContractionOp>(op))
    return contractSupportsMMAMatrixType(contract, useNvGpu);
  if (auto constant = dyn_cast<arith::ConstantOp>(op))
    return constantSupportsMMAMatrixType(constant);
  if (auto broadcast = dyn_cast<vector::BroadcastOp>(op))
    return broadcastSupportsMMAMatrixType(broadcast);
  return elementwiseSupportsMMAMatrixType(op);
}

/// Return an unsorted slice, handling scf.for regions differently than
/// `getSlice`: in scf.for we only want to include as part of the slice the
/// elements that are part of the use/def chain.
static SetVector<Operation *> getSliceContract(Operation *op,
                                               TransitiveFilter backwardFilter,
                                               TransitiveFilter forwardFilter) {
  SetVector<Operation *> slice;
  slice.insert(op);
  unsigned currentIndex = 0;
  SetVector<Operation *> backwardSlice;
  SetVector<Operation *> forwardSlice;
  while (currentIndex != slice.size()) {
    auto *currentOp = (slice)[currentIndex];
    // Compute and insert the backwardSlice starting from currentOp.
    backwardSlice.clear();
    getBackwardSlice(currentOp, &backwardSlice, backwardFilter);
    slice.insert(backwardSlice.begin(), backwardSlice.end());

    // Compute and insert the forwardSlice starting from currentOp.
    forwardSlice.clear();
    // Special case for ForOp: we don't want to include the whole region but
    // only the values using the region arguments.
    // TODO: We should refine this to only care about the region arguments
    // being converted to matrix type.
    if (auto forOp = dyn_cast<scf::ForOp>(currentOp)) {
      for (Value forOpResult : forOp.getResults())
        getForwardSlice(forOpResult, &forwardSlice, forwardFilter);
      for (BlockArgument &arg : forOp.getRegionIterArgs())
        getForwardSlice(arg, &forwardSlice, forwardFilter);
    } else {
      getForwardSlice(currentOp, &forwardSlice, forwardFilter);
    }
    slice.insert(forwardSlice.begin(), forwardSlice.end());
    ++currentIndex;
  }
  return slice;
}

// Analyze the slice of operations rooted at each vector.contract op to figure
// out if the whole slice can be converted to MMA operations.
static SetVector<Operation *> getOpToConvert(mlir::Operation *op,
                                             bool useNvGpu) {
  auto hasVectorDest = [](Operation *op) {
    return llvm::any_of(op->getResultTypes(),
                        [](Type t) { return t.isa<VectorType>(); });
  };
  auto hasVectorSrc = [](Operation *op) {
    return llvm::any_of(op->getOperandTypes(),
                        [](Type t) { return t.isa<VectorType>(); });
  };
  SetVector<Operation *> opToConvert;
  op->walk([&](vector::ContractionOp contract) {
    if (opToConvert.contains(contract.getOperation()))
      return;
    SetVector<Operation *> dependentOps =
        getSliceContract(contract, hasVectorDest, hasVectorSrc);
    // If any instruction cannot use the MMA matrix type, drop the whole
    // chain. MMA matrices are stored in an opaque type so they cannot be used
    // by all operations.
    if (llvm::any_of(dependentOps, [useNvGpu](Operation *op) {
          return !supportsMMaMatrixType(op, useNvGpu);
        }))
      return;
    opToConvert.insert(dependentOps.begin(), dependentOps.end());
  });
  // Sort the operations so that we can convert them in topological order.
  return topologicalSort(opToConvert);
}

namespace {
// Transform contract into (m, k)x(k, n)x(m, n) form so that it can be
// converted to MMA matmul.
struct PrepareContractToGPUMMA
    : public OpRewritePattern<vector::ContractionOp> {
  using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(vector::ContractionOp op,
                                PatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    Value lhs = op.getLhs(), rhs = op.getRhs(), res = op.getAcc();

    // Set up the parallel/reduction structure in the right form.
    using MapList = ArrayRef<ArrayRef<AffineExpr>>;
    auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
    AffineExpr m, n, k;
    bindDims(rewriter.getContext(), m, n, k);
    static constexpr std::array<int64_t, 2> perm = {1, 0};
    auto iteratorTypes = op.getIteratorTypes().getValue();
    SmallVector<AffineMap, 4> maps = op.getIndexingMapsArray();
    if (!(isParallelIterator(iteratorTypes[0]) &&
          isParallelIterator(iteratorTypes[1]) &&
          isReductionIterator(iteratorTypes[2])))
      return failure();
    //
    // Two outer parallel, one inner reduction (matmat flavor).
    //
    if (maps == infer({{m, k}, {k, n}, {m, n}})) {
      // This is the classical row-major matmul, nothing to do.
      return failure();
    }
    if (maps == infer({{m, k}, {n, k}, {m, n}})) {
      rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
    } else if (maps == infer({{k, m}, {k, n}, {m, n}})) {
      lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
    } else if (maps == infer({{k, m}, {n, k}, {m, n}})) {
      rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
      lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
    } else if (maps == infer({{m, k}, {k, n}, {n, m}})) {
      std::swap(rhs, lhs);
      rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
      lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
    } else if (maps == infer({{m, k}, {n, k}, {n, m}})) {
      std::swap(rhs, lhs);
      rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
    } else if (maps == infer({{k, m}, {k, n}, {n, m}})) {
      std::swap(lhs, rhs);
      lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
    } else if (maps == infer({{k, m}, {n, k}, {n, m}})) {
      std::swap(lhs, rhs);
    } else {
      return failure();
    }
    rewriter.replaceOpWithNewOp<vector::ContractionOp>(
        op, lhs, rhs, res,
        rewriter.getAffineMapArrayAttr(infer({{m, k}, {k, n}, {m, n}})),
        op.getIteratorTypes());
    return success();
  }
};

// Merge transpose op into the transfer read op. Transposes are not supported
// on MMA types but the MMA load can transpose the matrix when loading.
struct CombineTransferReadOpTranspose final
    : public OpRewritePattern<vector::TransposeOp> {
  using OpRewritePattern<vector::TransposeOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(vector::TransposeOp op,
                                PatternRewriter &rewriter) const override {
    auto transferReadOp =
        op.getVector().getDefiningOp<vector::TransferReadOp>();
    if (!transferReadOp)
      return failure();

    // TODO: support 0-d corner case.
    if (transferReadOp.getTransferRank() == 0)
      return failure();

    if (transferReadOp.getMask() || transferReadOp.hasOutOfBoundsDim())
      return failure();
    SmallVector<int64_t, 2> perm;
    op.getTransp(perm);
    SmallVector<unsigned, 2> permU;
    for (int64_t o : perm)
      permU.push_back(unsigned(o));
    AffineMap permutationMap =
        AffineMap::getPermutationMap(permU, op.getContext());
    AffineMap newMap =
        permutationMap.compose(transferReadOp.getPermutationMap());
    rewriter.replaceOpWithNewOp<vector::TransferReadOp>(
        op, op.getType(), transferReadOp.getSource(),
        transferReadOp.getIndices(), AffineMapAttr::get(newMap),
        transferReadOp.getPadding(), transferReadOp.getMask(),
        transferReadOp.getInBoundsAttr());
    return success();
  }
};

} // namespace

// MMA types have different layouts based on how they are used in matmul ops.
// Figure out the right layout to use by looking at op uses.
// TODO: Change the GPU dialect to abstract the layout at this level and
// only care about it during lowering to NVVM.
template <typename OpTy>
static const char *inferFragType(OpTy op) {
  for (Operation *users : op->getUsers()) {
    auto contract = dyn_cast<vector::ContractionOp>(users);
    if (!contract)
      continue;
    if (contract.getLhs() == op.getResult())
      return "AOp";
    if (contract.getRhs() == op.getResult())
      return "BOp";
  }
  return "COp";
}

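/// Convert a vector.transfer_read that satisfies
/// transferReadSupportsMMAMatrixType into a gpu.subgroup_mma_load_matrix op.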
static void convertTransferReadOp(vector::TransferReadOp op,
                                  llvm::DenseMap<Value, Value> &valueMapping) {
  assert(op.getTransferRank() > 0 && "unexpected 0-d transfer");
  assert(transferReadSupportsMMAMatrixType(op, /*useNvGpu=*/false));
  Optional<int64_t> stride =
      getMemrefConstantHorizontalStride(op.getShapedType());
  AffineMap map = op.getPermutationMap();
  // Handle broadcast by setting the stride to 0.
  if (map.getResult(0).isa<AffineConstantExpr>()) {
    assert(map.getResult(0).cast<AffineConstantExpr>().getValue() == 0);
    stride = 0;
  }
  assert(stride);
  const char *fragType = inferFragType(op);
  gpu::MMAMatrixType type =
      gpu::MMAMatrixType::get(op.getVectorType().getShape(),
                              op.getVectorType().getElementType(), fragType);
  OpBuilder b(op);
  Value load = b.create<gpu::SubgroupMmaLoadMatrixOp>(
      op.getLoc(), type, op.getSource(), op.getIndices(),
      b.getIndexAttr(*stride));
  valueMapping[op.getResult()] = load;
}

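/// Convert a vector.transfer_write of an MMA matrix into a
/// gpu.subgroup_mma_store_matrix op.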
static void convertTransferWriteOp(vector::TransferWriteOp op,
                                   llvm::DenseMap<Value, Value> &valueMapping) {
  assert(transferWriteSupportsMMAMatrixType(op));
  Optional<int64_t> stride =
      getMemrefConstantHorizontalStride(op.getShapedType());
  assert(stride);
  OpBuilder b(op);
  Value matrix = valueMapping.find(op.getVector())->second;
  b.create<gpu::SubgroupMmaStoreMatrixOp>(op.getLoc(), matrix, op.getSource(),
                                          op.getIndices(),
                                          b.getIndexAttr(*stride));
  op.erase();
}

/// Returns the vector type which represents a matrix fragment.
static VectorType
getMmaSyncVectorOperandType(const nvgpu::FragmentElementInfo &regInfo) {
  SmallVector<int64_t> shape{regInfo.numRegistersPerFragment,
                             regInfo.elementsPerRegister};
  Type elType = regInfo.registerLLVMType;
  if (auto vecType = elType.dyn_cast<VectorType>())
    elType = vecType.getElementType();
  return VectorType::get(shape, elType);
}

/// Convert a 2D splat ConstantOp to a splat arith.constant of the distributed
/// register fragment type used on the nvgpu.mma.sync path.
static LogicalResult
convertConstantOpMmaSync(arith::ConstantOp op,
                         llvm::DenseMap<Value, Value> &valueMapping) {
  OpBuilder b(op);
  FailureOr<nvgpu::WarpMatrixInfo> warpMatrixInfo =
      nvgpu::getWarpMatrixInfo(op);
  if (failed(warpMatrixInfo))
    return failure();

  FailureOr<nvgpu::FragmentElementInfo> regInfo =
      nvgpu::getMmaSyncRegisterType(*warpMatrixInfo);
  if (failed(regInfo))
    return failure();

  VectorType vectorType = getMmaSyncVectorOperandType(*regInfo);
  auto dense = op.getValue().dyn_cast<SplatElementsAttr>();
  if (!dense)
    return failure();
  Value result = b.create<arith::ConstantOp>(
      op.getLoc(), vectorType,
      DenseElementsAttr::get(vectorType, dense.getSplatValue<Attribute>()));
  valueMapping[op.getResult()] = result;
  return success();
}

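/// Convert a vector.transfer_read into an nvgpu.ldmatrix op that loads the
/// per-lane register fragment, offsetting the indices with the lane id.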
static LogicalResult
creatLdMatrixCompatibleLoads(vector::TransferReadOp op, OpBuilder &builder,
                             llvm::DenseMap<Value, Value> &valueMapping) {
  Location loc = op->getLoc();

  FailureOr<nvgpu::WarpMatrixInfo> warpMatrixInfo =
      nvgpu::getWarpMatrixInfo(op);
  if (failed(warpMatrixInfo))
    return failure();

  FailureOr<nvgpu::FragmentElementInfo> regInfo =
      nvgpu::getMmaSyncRegisterType(*warpMatrixInfo);
  if (failed(regInfo))
    return failure();

  FailureOr<nvgpu::LdMatrixParams> params = nvgpu::getLdMatrixParams(
      *warpMatrixInfo,
      /*transpose=*/!op.getPermutationMap().isMinorIdentity());
  if (failed(params)) {
    return op->emitError()
           << "failed to convert vector.transfer_read to ldmatrix; this op "
              "likely should not be converted to a nvgpu.ldmatrix call.";
  }

  // Adjust the load offset.
  auto laneId = builder.create<gpu::LaneIdOp>(loc);
  FailureOr<AffineMap> offsets =
      nvgpu::getLaneIdToLdMatrixMatrixCoord(loc, builder, *params);
  if (failed(offsets))
    return failure();

  VectorType vectorType = getMmaSyncVectorOperandType(*regInfo);

  SmallVector<Value, 4> indices;
  getXferIndices<vector::TransferReadOp>(builder, op, *offsets, {laneId},
                                         indices);
  nvgpu::LdMatrixOp newOp = builder.create<nvgpu::LdMatrixOp>(
      loc, vectorType, op.getSource(), indices,
      !op.getPermutationMap().isMinorIdentity(), params->numTiles);
  valueMapping[op] = newOp->getResult(0);
  return success();
}

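/// Convert a vector.transfer_read into per-lane vector.load ops (or scalar
/// memref.load ops for transposed accesses) when the access pattern is not
/// compatible with nvgpu.ldmatrix.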
static LogicalResult
createNonLdMatrixLoads(vector::TransferReadOp op, OpBuilder &builder,
                       llvm::DenseMap<Value, Value> &valueMapping) {
  Location loc = op.getLoc();
  FailureOr<nvgpu::WarpMatrixInfo> warpMatrixInfo =
      nvgpu::getWarpMatrixInfo(op);
  if (failed(warpMatrixInfo))
    return failure();
  FailureOr<nvgpu::FragmentElementInfo> regInfo =
      nvgpu::getMmaSyncRegisterType(*warpMatrixInfo);
  if (failed(regInfo)) {
    op->emitError() << "Failed to deduce register fragment type during "
                       "conversion to distributed non-ldmatrix compatible load";
    return failure();
  }

  Value laneId = builder.create<gpu::LaneIdOp>(loc);
  SmallVector<Value, 4> elements;

  // This is the individual element type.
  Type loadedElType = regInfo->registerLLVMType;
  VectorType vectorType = getMmaSyncVectorOperandType(*regInfo);

  Value fill = builder.create<arith::ConstantOp>(
      op.getLoc(), vectorType.getElementType(),
      builder.getZeroAttr(vectorType.getElementType()));
  Value result = builder.create<vector::SplatOp>(op.getLoc(), fill, vectorType);

  bool isTransposeLoad = !op.getPermutationMap().isMinorIdentity();

  // If we are not transposing, then we can use vectorized loads. Otherwise, we
  // must load each element individually.
  if (!isTransposeLoad) {
    if (!loadedElType.isa<VectorType>()) {
      loadedElType = VectorType::get({1}, loadedElType);
    }

    for (int i = 0; i < vectorType.getShape()[0]; i++) {
      FailureOr<AffineMap> coords = nvgpu::getLaneIdAndValueIdToOperandCoord(
          op.getLoc(), builder, *warpMatrixInfo);
      if (failed(coords))
        return failure();
      Value logicalValueId = builder.create<arith::ConstantOp>(
          loc, builder.getIndexType(),
          builder.getIndexAttr(i * regInfo->elementsPerRegister));
      SmallVector<Value, 4> newIndices;
      getXferIndices<vector::TransferReadOp>(
          builder, op, *coords, {laneId, logicalValueId}, newIndices);

      Value el = builder.create<vector::LoadOp>(loc, loadedElType,
                                                op.getSource(), newIndices);
      result = builder.create<vector::InsertOp>(loc, el, result,
                                                builder.getI64ArrayAttr(i));
    }
  } else {
    if (auto vecType = loadedElType.dyn_cast<VectorType>()) {
      loadedElType = vecType.getElementType();
    }
    for (int i = 0; i < vectorType.getShape()[0]; i++) {
      for (unsigned innerIdx = 0; innerIdx < vectorType.getShape()[1];
           innerIdx++) {

        Value logicalValueId = builder.create<arith::ConstantOp>(
            loc, builder.getIndexType(),
            builder.getIndexAttr(i * regInfo->elementsPerRegister + innerIdx));
        FailureOr<AffineMap> coords = nvgpu::getLaneIdAndValueIdToOperandCoord(
            op.getLoc(), builder, *warpMatrixInfo);
        if (failed(coords))
          return failure();

        SmallVector<Value, 4> newIndices;
        getXferIndices<vector::TransferReadOp>(
            builder, op, *coords, {laneId, logicalValueId}, newIndices);
        Value el = builder.create<memref::LoadOp>(op.getLoc(), loadedElType,
                                                  op.getSource(), newIndices);
        result = builder.create<vector::InsertOp>(
            op.getLoc(), el, result, builder.getI64ArrayAttr({i, innerIdx}));
      }
    }
  }

  valueMapping[op.getResult()] = result;
  return success();
}

/// Converts a `vector.transfer_read` operation directly to either a
/// `vector.load` or a `nvgpu.ldmatrix` operation. This function should only be
/// used when converting to `nvgpu.mma.sync` operations.
static LogicalResult
convertTransferReadToLoads(vector::TransferReadOp op,
                           llvm::DenseMap<Value, Value> &valueMapping) {
  OpBuilder b(op);

  FailureOr<nvgpu::WarpMatrixInfo> warpMatrixInfo =
      nvgpu::getWarpMatrixInfo(op);
  if (failed(warpMatrixInfo))
    return failure();

  bool isLdMatrixCompatible =
      op.getSource().getType().cast<MemRefType>().getMemorySpaceAsInt() == 3 &&
      nvgpu::inferTileWidthInBits(*warpMatrixInfo) == 128;

  VectorType vecTy = op.getVectorType();
  int64_t bitWidth = vecTy.getElementType().getIntOrFloatBitWidth();

  // When we are transposing the B operand, ldmatrix will only work if we have
  // at least 8 rows to read and the width to read for the transpose is 128
  // bits.
  if (!op.getPermutationMap().isMinorIdentity() &&
      (bitWidth != 16 || vecTy.getDimSize(1) < 8 ||
       vecTy.getDimSize(0) * bitWidth < 128))
    isLdMatrixCompatible = false;

  if (!isLdMatrixCompatible)
    return createNonLdMatrixLoads(op, b, valueMapping);

  return creatLdMatrixCompatibleLoads(op, b, valueMapping);
}

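/// Convert a vector.transfer_write of a distributed register fragment into
/// per-lane vector.store ops. Used on the nvgpu.mma.sync path.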
static LogicalResult
convertTransferWriteToStores(vector::TransferWriteOp op,
                             llvm::DenseMap<Value, Value> &valueMapping) {
  OpBuilder b(op);
  Location loc = op->getLoc();
  Value matrix = valueMapping.find(op.getVector())->second;

  FailureOr<nvgpu::WarpMatrixInfo> warpMatrixInfo =
      nvgpu::getWarpMatrixInfo(op);
  if (failed(warpMatrixInfo))
    return failure();
  FailureOr<nvgpu::FragmentElementInfo> regInfo =
      nvgpu::getMmaSyncRegisterType(*warpMatrixInfo);
  if (failed(regInfo))
    return failure();

  VectorType vectorType = getMmaSyncVectorOperandType(*regInfo);
  Value laneId = b.create<gpu::LaneIdOp>(loc);

  for (unsigned i = 0; i < vectorType.getShape()[0]; i++) {
    Value logicalValueId = b.create<arith::ConstantOp>(
        loc, b.getIndexType(),
        b.getIndexAttr(i * regInfo->elementsPerRegister));
    FailureOr<AffineMap> coords = nvgpu::getLaneIdAndValueIdToOperandCoord(
        op.getLoc(), b, *warpMatrixInfo);
    if (failed(coords))
      return failure();

    Value el = b.create<vector::ExtractOp>(loc, matrix, ArrayRef<int64_t>{i});
    SmallVector<Value, 4> newIndices;
    getXferIndices<vector::TransferWriteOp>(
        b, op, *coords, {laneId, logicalValueId}, newIndices);
    b.create<vector::StoreOp>(loc, el, op.getSource(), newIndices);
  }
  op->erase();
  return success();
}

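/// Convert a vector.contract into a gpu.subgroup_mma_compute op, using the
/// previously converted MMA operands.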
static void convertContractOp(vector::ContractionOp op,
                              llvm::DenseMap<Value, Value> &valueMapping) {
  OpBuilder b(op);
  Value opA = valueMapping.find(op.getLhs())->second;
  Value opB = valueMapping.find(op.getRhs())->second;
  Value opC = valueMapping.find(op.getAcc())->second;
  Value matmul = b.create<gpu::SubgroupMmaComputeOp>(op.getLoc(), opC.getType(),
                                                     opA, opB, opC);
  valueMapping[op.getResult()] = matmul;
}

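/// Convert a vector.contract into an nvgpu.mma.sync op, using the previously
/// distributed operands.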
static LogicalResult
convertContractOpToMmaSync(vector::ContractionOp op,
                           llvm::DenseMap<Value, Value> &valueMapping) {
  OpBuilder b(op);
  Value opA = valueMapping.find(op.getLhs())->second;
  Value opB = valueMapping.find(op.getRhs())->second;
  Value opC = valueMapping.find(op.getAcc())->second;
  int64_t m = op.getLhs().getType().cast<VectorType>().getShape()[0];
  int64_t n = op.getRhs().getType().cast<VectorType>().getShape()[0];
  int64_t k = op.getLhs().getType().cast<VectorType>().getShape()[1];
  Value matmul = b.create<nvgpu::MmaSyncOp>(
      op.getLoc(), opC.getType(), opA, opB, opC, b.getI64ArrayAttr({m, n, k}));
  valueMapping[op.getResult()] = matmul;
  return success();
}

/// Convert a 2D splat ConstantOp to a SubgroupMmaConstantMatrix op.
static void convertConstantOp(arith::ConstantOp op,
                              llvm::DenseMap<Value, Value> &valueMapping) {
  assert(constantSupportsMMAMatrixType(op));
  OpBuilder b(op);
  Attribute splat =
      op.getValue().cast<SplatElementsAttr>().getSplatValue<Attribute>();
  auto scalarConstant =
      b.create<arith::ConstantOp>(op.getLoc(), splat.getType(), splat);
  const char *fragType = inferFragType(op);
  auto vecType = op.getType().cast<VectorType>();
  gpu::MMAMatrixType type = gpu::MMAMatrixType::get(
      vecType.getShape(), vecType.getElementType(), llvm::StringRef(fragType));
  auto matrix = b.create<gpu::SubgroupMmaConstantMatrixOp>(op.getLoc(), type,
                                                           scalarConstant);
  valueMapping[op.getResult()] = matrix;
}

/// Convert a vector.broadcast from scalar to a SubgroupMmaConstantMatrix op.
static void convertBroadcastOp(vector::BroadcastOp op,
                               llvm::DenseMap<Value, Value> &valueMapping) {
  assert(broadcastSupportsMMAMatrixType(op));
  OpBuilder b(op);
  const char *fragType = inferFragType(op);
  auto vecType = op.getVectorType();
  gpu::MMAMatrixType type = gpu::MMAMatrixType::get(
      vecType.getShape(), vecType.getElementType(), llvm::StringRef(fragType));
  auto matrix = b.create<gpu::SubgroupMmaConstantMatrixOp>(op.getLoc(), type,
                                                           op.getSource());
  valueMapping[op.getResult()] = matrix;
}

// Replace ForOp with a new ForOp with extra operands. The YieldOp is not
// updated and needs to be updated separately for the loop to be correct.
static scf::ForOp replaceForOpWithNewSignature(OpBuilder &b, scf::ForOp loop,
                                               ValueRange newIterOperands) {
  // Create a new loop before the existing one, with the extra operands.
  OpBuilder::InsertionGuard g(b);
  b.setInsertionPoint(loop);
  auto operands = llvm::to_vector<4>(loop.getIterOperands());
  operands.append(newIterOperands.begin(), newIterOperands.end());
  scf::ForOp newLoop =
      b.create<scf::ForOp>(loop.getLoc(), loop.getLowerBound(),
                           loop.getUpperBound(), loop.getStep(), operands);
  newLoop.getBody()->erase();
  newLoop.getLoopBody().getBlocks().splice(
      newLoop.getLoopBody().getBlocks().begin(),
      loop.getLoopBody().getBlocks());
  for (Value operand : newIterOperands)
    newLoop.getBody()->addArgument(operand.getType(), operand.getLoc());

  for (auto it : llvm::zip(loop.getResults(), newLoop.getResults().take_front(
                                                  loop.getNumResults())))
    std::get<0>(it).replaceAllUsesWith(std::get<1>(it));
  loop.erase();
  return newLoop;
}

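/// Rewrite an scf.for op so that values already mapped to MMA matrices are
/// carried as extra iteration arguments, and record the mapping for the new
/// results and block arguments.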
static void convertForOp(scf::ForOp op,
                         llvm::DenseMap<Value, Value> &valueMapping) {
  SmallVector<Value> newOperands;
  SmallVector<std::pair<size_t, size_t>> argMapping;
  for (const auto &operand : llvm::enumerate(op.getIterOperands())) {
    auto it = valueMapping.find(operand.value());
    if (it == valueMapping.end())
      continue;
    argMapping.push_back(std::make_pair(
        operand.index(), op.getNumIterOperands() + newOperands.size()));
    newOperands.push_back(it->second);
  }
  OpBuilder b(op);
  scf::ForOp newForOp = replaceForOpWithNewSignature(b, op, newOperands);
  Block &loopBody = *newForOp.getBody();
  for (auto mapping : argMapping) {
    valueMapping[newForOp.getResult(mapping.first)] =
        newForOp.getResult(mapping.second);
    valueMapping[loopBody.getArgument(mapping.first +
                                      newForOp.getNumInductionVars())] =
        loopBody.getArgument(mapping.second + newForOp.getNumInductionVars());
  }
}

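/// Rewrite the scf.yield of a converted loop so that the mapped MMA values are
/// yielded through the extra iteration arguments added by convertForOp.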
static void convertYieldOp(scf::YieldOp op,
                           llvm::DenseMap<Value, Value> &valueMapping) {
  OpBuilder b(op);
  auto loop = cast<scf::ForOp>(op->getParentOp());
  auto yieldOperands = llvm::to_vector<4>(op.getOperands());
  for (const auto &operand : llvm::enumerate(op.getOperands())) {
    auto it = valueMapping.find(operand.value());
    if (it == valueMapping.end())
      continue;
    // Replace the yield of the old value with the for op argument to make it
    // easier to remove the dead code.
    yieldOperands[operand.index()] = loop.getIterOperands()[operand.index()];
    yieldOperands.push_back(it->second);
  }
  b.create<scf::YieldOp>(op.getLoc(), yieldOperands);
  op.erase();
}

/// Convert an elementwise op to the equivalent elementwise op on MMA matrix.
static void convertElementwiseOp(Operation *op, gpu::MMAElementwiseOp opType,
                                 llvm::DenseMap<Value, Value> &valueMapping) {
  OpBuilder b(op);
  SmallVector<Value> matrixOperands;
  for (Value operand : op->getOperands())
    matrixOperands.push_back(valueMapping.find(operand)->second);
  Value newOp = b.create<gpu::SubgroupMmaElementwiseOp>(
      op->getLoc(), matrixOperands[0].getType(), matrixOperands, opType);
  valueMapping[op->getResult(0)] = newOp;
}

void mlir::populatePrepareVectorToMMAPatterns(RewritePatternSet &patterns,
                                              bool useNvGpu) {
  if (!useNvGpu) {
    patterns.add<PrepareContractToGPUMMA, CombineTransferReadOpTranspose>(
        patterns.getContext());
    return;
  }
  patterns
      .add<nvgpu::PrepareContractToGPUMMASync, CombineTransferReadOpTranspose>(
          patterns.getContext());
}

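/// Convert the supported vector ops in the slices rooted at vector.contract
/// ops into gpu.subgroup_mma ops (the WMMA path).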
void mlir::convertVectorToMMAOps(Operation *rootOp) {
  SetVector<Operation *> ops = getOpToConvert(rootOp, /*useNvGpu=*/false);
  llvm::DenseMap<Value, Value> valueMapping;
  for (Operation *op : ops) {
    if (auto transferRead = dyn_cast<vector::TransferReadOp>(op)) {
      convertTransferReadOp(transferRead, valueMapping);
    } else if (auto transferWrite = dyn_cast<vector::TransferWriteOp>(op)) {
      convertTransferWriteOp(transferWrite, valueMapping);
    } else if (auto contractOp = dyn_cast<vector::ContractionOp>(op)) {
      convertContractOp(contractOp, valueMapping);
    } else if (auto constantOp = dyn_cast<arith::ConstantOp>(op)) {
      convertConstantOp(constantOp, valueMapping);
    } else if (auto broadcastOp = dyn_cast<vector::BroadcastOp>(op)) {
      convertBroadcastOp(broadcastOp, valueMapping);
    } else if (auto forOp = dyn_cast<scf::ForOp>(op)) {
      convertForOp(forOp, valueMapping);
    } else if (auto yieldOp = dyn_cast<scf::YieldOp>(op)) {
      convertYieldOp(yieldOp, valueMapping);
    } else if (auto elementwiseType = convertElementwiseOpToMMA(op)) {
      convertElementwiseOp(op, *elementwiseType, valueMapping);
    }
  }
}

LogicalResult mlir::convertVectorToNVVMCompatibleMMASync(Operation *rootOp) {
  SetVector<Operation *> ops = getOpToConvert(rootOp, /*useNvGpu=*/true);
  llvm::DenseMap<Value, Value> valueMapping;
  for (Operation *op : ops) {
    if (llvm::TypeSwitch<Operation *, LogicalResult>(op)
            .Case([&](vector::TransferReadOp transferReadOp) {
              return convertTransferReadToLoads(transferReadOp, valueMapping);
            })
            .Case([&](vector::TransferWriteOp transferWriteOp) {
              return convertTransferWriteToStores(transferWriteOp,
                                                  valueMapping);
            })
            .Case([&](vector::ContractionOp contractionOp) {
              return convertContractOpToMmaSync(contractionOp, valueMapping);
            })
            .Case([&](scf::ForOp forOp) {
              convertForOp(forOp, valueMapping);
              return success();
            })
            .Case([&](scf::YieldOp yieldOp) {
              convertYieldOp(yieldOp, valueMapping);
              return success();
            })
            .Case([&](arith::ConstantOp constOp) {
              return convertConstantOpMmaSync(constOp, valueMapping);
            })
            .Default([&](Operation *op) {
              op->emitError() << "unhandled vector to mma type: " << *op;
              return failure();
            })
            .failed()) {
      op->emitError() << "Failed to convert op " << *op;
      return failure();
    }
  }
  return success();
}

namespace {

struct ConvertVectorToGPUPass
    : public ConvertVectorToGPUBase<ConvertVectorToGPUPass> {

  explicit ConvertVectorToGPUPass(bool useNvGpu_) {
    useNvGpu.setValue(useNvGpu_);
  }

  void runOnOperation() override {
    RewritePatternSet patterns(&getContext());
    populatePrepareVectorToMMAPatterns(patterns, useNvGpu.getValue());
    if (failed(
            applyPatternsAndFoldGreedily(getOperation(), std::move(patterns))))
      return signalPassFailure();

    if (useNvGpu.getValue()) {
      if (failed(convertVectorToNVVMCompatibleMMASync(getOperation())))
        return signalPassFailure();
    }

    (void)convertVectorToMMAOps(getOperation());
  }
};

} // namespace

std::unique_ptr<Pass> mlir::createConvertVectorToGPUPass(bool useNvGpu) {
  return std::make_unique<ConvertVectorToGPUPass>(useNvGpu);
}