X86Vector/Transforms/AVXTranspose.cpp

34ff8573SNicolas Vasilache//===- AVXTranspose.cpp - Lower Vector transpose to AVX -------------------===//
34ff8573SNicolas Vasilache//
34ff8573SNicolas Vasilache// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
34ff8573SNicolas Vasilache// See https://llvm.org/LICENSE.txt for license information.
34ff8573SNicolas Vasilache// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
34ff8573SNicolas Vasilache//
34ff8573SNicolas Vasilache//===----------------------------------------------------------------------===//
34ff8573SNicolas Vasilache//
34ff8573SNicolas Vasilache// This file implements vector.transpose rewrites as AVX patterns for particular
34ff8573SNicolas Vasilache// sizes of interest.
34ff8573SNicolas Vasilache//
34ff8573SNicolas Vasilache//===----------------------------------------------------------------------===//
34ff8573SNicolas Vasilache
*eda6f907SRiver Riddle#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
b2729fdaSNicolas Vasilache#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
99ef9eebSMatthias Springer#include "mlir/Dialect/Vector/IR/VectorOps.h"
34ff8573SNicolas Vasilache#include "mlir/Dialect/X86Vector/Transforms.h"
34ff8573SNicolas Vasilache#include "mlir/IR/ImplicitLocOpBuilder.h"
34ff8573SNicolas Vasilache#include "mlir/IR/Matchers.h"
34ff8573SNicolas Vasilache#include "mlir/IR/PatternMatch.h"
b2729fdaSNicolas Vasilache#include "llvm/Support/Format.h"
b2729fdaSNicolas Vasilache#include "llvm/Support/FormatVariadic.h"
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilacheusing namespace mlir;
34ff8573SNicolas Vasilacheusing namespace mlir::vector;
34ff8573SNicolas Vasilacheusing namespace mlir::x86vector;
34ff8573SNicolas Vasilacheusing namespace mlir::x86vector::avx2;
b2729fdaSNicolas Vasilacheusing namespace mlir::x86vector::avx2::inline_asm;
b2729fdaSNicolas Vasilacheusing namespace mlir::x86vector::avx2::intrin;
34ff8573SNicolas Vasilache
b2729fdaSNicolas VasilacheValue mlir::x86vector::avx2::inline_asm::mm256BlendPsAsm(
b2729fdaSNicolas Vasilache    ImplicitLocOpBuilder &b, Value v1, Value v2, uint8_t mask) {
b2729fdaSNicolas Vasilache  auto asmDialectAttr =
b2729fdaSNicolas Vasilache      LLVM::AsmDialectAttr::get(b.getContext(), LLVM::AsmDialect::AD_Intel);
02b6fb21SMehdi Amini  const auto *asmTp = "vblendps $0, $1, $2, {0}";
02b6fb21SMehdi Amini  const auto *asmCstr =
02b6fb21SMehdi Amini      "=x,x,x"; // Careful: constraint parser is very brittle: no ws!
b2729fdaSNicolas Vasilache  SmallVector<Value> asmVals{v1, v2};
b2729fdaSNicolas Vasilache  auto asmStr = llvm::formatv(asmTp, llvm::format_hex(mask, /*width=*/2)).str();
b2729fdaSNicolas Vasilache  auto asmOp = b.create<LLVM::InlineAsmOp>(
42398b51SNicolas Vasilache      v1.getType(), /*operands=*/asmVals, /*asm_string=*/asmStr,
42398b51SNicolas Vasilache      /*constraints=*/asmCstr, /*has_side_effects=*/false,
42398b51SNicolas Vasilache      /*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr,
42398b51SNicolas Vasilache      /*operand_attrs=*/ArrayAttr());
b2729fdaSNicolas Vasilache  return asmOp.getResult(0);
b2729fdaSNicolas Vasilache}
b2729fdaSNicolas Vasilache
b2729fdaSNicolas VasilacheValue mlir::x86vector::avx2::intrin::mm256UnpackLoPs(ImplicitLocOpBuilder &b,
b2729fdaSNicolas Vasilache                                                     Value v1, Value v2) {
34ff8573SNicolas Vasilache  return b.create<vector::ShuffleOp>(
34ff8573SNicolas Vasilache      v1, v2, ArrayRef<int64_t>{0, 8, 1, 9, 4, 12, 5, 13});
34ff8573SNicolas Vasilache}
34ff8573SNicolas Vasilache
b2729fdaSNicolas VasilacheValue mlir::x86vector::avx2::intrin::mm256UnpackHiPs(ImplicitLocOpBuilder &b,
b2729fdaSNicolas Vasilache                                                     Value v1, Value v2) {
34ff8573SNicolas Vasilache  return b.create<vector::ShuffleOp>(
34ff8573SNicolas Vasilache      v1, v2, ArrayRef<int64_t>{2, 10, 3, 11, 6, 14, 7, 15});
34ff8573SNicolas Vasilache}
34ff8573SNicolas Vasilache///                            a  a   b   b  a  a   b   b
34ff8573SNicolas Vasilache/// Takes an 8 bit mask, 2 bit for each position of a[0, 3)  **and** b[0, 4):
34ff8573SNicolas Vasilache///                                 0:127    |         128:255
34ff8573SNicolas Vasilache///                            b01  b23  C8  D8  |  b01+4 b23+4 C8+4 D8+4
b2729fdaSNicolas VasilacheValue mlir::x86vector::avx2::intrin::mm256ShufflePs(ImplicitLocOpBuilder &b,
b2729fdaSNicolas Vasilache                                                    Value v1, Value v2,
b2729fdaSNicolas Vasilache                                                    uint8_t mask) {
b2729fdaSNicolas Vasilache  uint8_t b01, b23, b45, b67;
34ff8573SNicolas Vasilache  MaskHelper::extractShuffle(mask, b01, b23, b45, b67);
34ff8573SNicolas Vasilache  SmallVector<int64_t> shuffleMask{b01,     b23,     b45 + 8,     b67 + 8,
34ff8573SNicolas Vasilache                                   b01 + 4, b23 + 4, b45 + 8 + 4, b67 + 8 + 4};
34ff8573SNicolas Vasilache  return b.create<vector::ShuffleOp>(v1, v2, shuffleMask);
34ff8573SNicolas Vasilache}
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilache// imm[0:1] out of imm[0:3] is:
34ff8573SNicolas Vasilache//    0             1           2             3
34ff8573SNicolas Vasilache// a[0:127] or a[128:255] or b[0:127] or b[128:255]    |
34ff8573SNicolas Vasilache//          a[0:127] or a[128:255] or b[0:127] or b[128:255]
34ff8573SNicolas Vasilache//             0             1           2             3
34ff8573SNicolas Vasilache// imm[0:1] out of imm[4:7].
b2729fdaSNicolas VasilacheValue mlir::x86vector::avx2::intrin::mm256Permute2f128Ps(
b2729fdaSNicolas Vasilache    ImplicitLocOpBuilder &b, Value v1, Value v2, uint8_t mask) {
34ff8573SNicolas Vasilache  SmallVector<int64_t> shuffleMask;
b2729fdaSNicolas Vasilache  auto appendToMask = [&](uint8_t control) {
34ff8573SNicolas Vasilache    if (control == 0)
34ff8573SNicolas Vasilache      llvm::append_range(shuffleMask, ArrayRef<int64_t>{0, 1, 2, 3});
34ff8573SNicolas Vasilache    else if (control == 1)
34ff8573SNicolas Vasilache      llvm::append_range(shuffleMask, ArrayRef<int64_t>{4, 5, 6, 7});
34ff8573SNicolas Vasilache    else if (control == 2)
34ff8573SNicolas Vasilache      llvm::append_range(shuffleMask, ArrayRef<int64_t>{8, 9, 10, 11});
34ff8573SNicolas Vasilache    else if (control == 3)
34ff8573SNicolas Vasilache      llvm::append_range(shuffleMask, ArrayRef<int64_t>{12, 13, 14, 15});
34ff8573SNicolas Vasilache    else
34ff8573SNicolas Vasilache      llvm_unreachable("control > 3 : overflow");
34ff8573SNicolas Vasilache  };
b2729fdaSNicolas Vasilache  uint8_t b03, b47;
34ff8573SNicolas Vasilache  MaskHelper::extractPermute(mask, b03, b47);
34ff8573SNicolas Vasilache  appendToMask(b03);
34ff8573SNicolas Vasilache  appendToMask(b47);
34ff8573SNicolas Vasilache  return b.create<vector::ShuffleOp>(v1, v2, shuffleMask);
34ff8573SNicolas Vasilache}
34ff8573SNicolas Vasilache
b2729fdaSNicolas Vasilache/// If bit i of `mask` is zero, take f32@i from v1 else take it from v2.
b2729fdaSNicolas VasilacheValue mlir::x86vector::avx2::intrin::mm256BlendPs(ImplicitLocOpBuilder &b,
b2729fdaSNicolas Vasilache                                                  Value v1, Value v2,
b2729fdaSNicolas Vasilache                                                  uint8_t mask) {
b2729fdaSNicolas Vasilache  SmallVector<int64_t, 8> shuffleMask;
b2729fdaSNicolas Vasilache  for (int i = 0; i < 8; ++i) {
b2729fdaSNicolas Vasilache    bool isSet = mask & (1 << i);
b2729fdaSNicolas Vasilache    shuffleMask.push_back(!isSet ? i : i + 8);
b2729fdaSNicolas Vasilache  }
b2729fdaSNicolas Vasilache  return b.create<vector::ShuffleOp>(v1, v2, shuffleMask);
b2729fdaSNicolas Vasilache}
b2729fdaSNicolas Vasilache
34ff8573SNicolas Vasilache/// AVX2 4x8xf32-specific transpose lowering using a "C intrinsics" model.
34ff8573SNicolas Vasilachevoid mlir::x86vector::avx2::transpose4x8xf32(ImplicitLocOpBuilder &ib,
34ff8573SNicolas Vasilache                                             MutableArrayRef<Value> vs) {
34ff8573SNicolas Vasilache#ifndef NDEBUG
f04a1237SBenjamin Kramer  auto vt = VectorType::get({8}, Float32Type::get(ib.getContext()));
34ff8573SNicolas Vasilache  assert(vs.size() == 4 && "expects 4 vectors");
34ff8573SNicolas Vasilache  assert(llvm::all_of(ValueRange{vs}.getTypes(),
34ff8573SNicolas Vasilache                      [&](Type t) { return t == vt; }) &&
34ff8573SNicolas Vasilache         "expects all types to be vector<8xf32>");
34ff8573SNicolas Vasilache#endif
34ff8573SNicolas Vasilache
02b6fb21SMehdi Amini  Value t0 = mm256UnpackLoPs(ib, vs[0], vs[1]);
02b6fb21SMehdi Amini  Value t1 = mm256UnpackHiPs(ib, vs[0], vs[1]);
02b6fb21SMehdi Amini  Value t2 = mm256UnpackLoPs(ib, vs[2], vs[3]);
02b6fb21SMehdi Amini  Value t3 = mm256UnpackHiPs(ib, vs[2], vs[3]);
02b6fb21SMehdi Amini  Value s0 = mm256ShufflePs(ib, t0, t2, MaskHelper::shuffle<1, 0, 1, 0>());
02b6fb21SMehdi Amini  Value s1 = mm256ShufflePs(ib, t0, t2, MaskHelper::shuffle<3, 2, 3, 2>());
02b6fb21SMehdi Amini  Value s2 = mm256ShufflePs(ib, t1, t3, MaskHelper::shuffle<1, 0, 1, 0>());
02b6fb21SMehdi Amini  Value s3 = mm256ShufflePs(ib, t1, t3, MaskHelper::shuffle<3, 2, 3, 2>());
02b6fb21SMehdi Amini  vs[0] = mm256Permute2f128Ps(ib, s0, s1, MaskHelper::permute<2, 0>());
02b6fb21SMehdi Amini  vs[1] = mm256Permute2f128Ps(ib, s2, s3, MaskHelper::permute<2, 0>());
02b6fb21SMehdi Amini  vs[2] = mm256Permute2f128Ps(ib, s0, s1, MaskHelper::permute<3, 1>());
02b6fb21SMehdi Amini  vs[3] = mm256Permute2f128Ps(ib, s2, s3, MaskHelper::permute<3, 1>());
34ff8573SNicolas Vasilache}
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilache/// AVX2 8x8xf32-specific transpose lowering using a "C intrinsics" model.
34ff8573SNicolas Vasilachevoid mlir::x86vector::avx2::transpose8x8xf32(ImplicitLocOpBuilder &ib,
34ff8573SNicolas Vasilache                                             MutableArrayRef<Value> vs) {
34ff8573SNicolas Vasilache  auto vt = VectorType::get({8}, Float32Type::get(ib.getContext()));
34ff8573SNicolas Vasilache  (void)vt;
34ff8573SNicolas Vasilache  assert(vs.size() == 8 && "expects 8 vectors");
34ff8573SNicolas Vasilache  assert(llvm::all_of(ValueRange{vs}.getTypes(),
34ff8573SNicolas Vasilache                      [&](Type t) { return t == vt; }) &&
34ff8573SNicolas Vasilache         "expects all types to be vector<8xf32>");
34ff8573SNicolas Vasilache
02b6fb21SMehdi Amini  Value t0 = mm256UnpackLoPs(ib, vs[0], vs[1]);
02b6fb21SMehdi Amini  Value t1 = mm256UnpackHiPs(ib, vs[0], vs[1]);
02b6fb21SMehdi Amini  Value t2 = mm256UnpackLoPs(ib, vs[2], vs[3]);
02b6fb21SMehdi Amini  Value t3 = mm256UnpackHiPs(ib, vs[2], vs[3]);
02b6fb21SMehdi Amini  Value t4 = mm256UnpackLoPs(ib, vs[4], vs[5]);
02b6fb21SMehdi Amini  Value t5 = mm256UnpackHiPs(ib, vs[4], vs[5]);
02b6fb21SMehdi Amini  Value t6 = mm256UnpackLoPs(ib, vs[6], vs[7]);
02b6fb21SMehdi Amini  Value t7 = mm256UnpackHiPs(ib, vs[6], vs[7]);
b2729fdaSNicolas Vasilache
b2729fdaSNicolas Vasilache  using inline_asm::mm256BlendPsAsm;
02b6fb21SMehdi Amini  Value sh0 = mm256ShufflePs(ib, t0, t2, MaskHelper::shuffle<1, 0, 3, 2>());
02b6fb21SMehdi Amini  Value sh2 = mm256ShufflePs(ib, t1, t3, MaskHelper::shuffle<1, 0, 3, 2>());
02b6fb21SMehdi Amini  Value sh4 = mm256ShufflePs(ib, t4, t6, MaskHelper::shuffle<1, 0, 3, 2>());
02b6fb21SMehdi Amini  Value sh6 = mm256ShufflePs(ib, t5, t7, MaskHelper::shuffle<1, 0, 3, 2>());
b2729fdaSNicolas Vasilache
02b6fb21SMehdi Amini  Value s0 =
02b6fb21SMehdi Amini      mm256BlendPsAsm(ib, t0, sh0, MaskHelper::blend<0, 0, 1, 1, 0, 0, 1, 1>());
02b6fb21SMehdi Amini  Value s1 =
02b6fb21SMehdi Amini      mm256BlendPsAsm(ib, t2, sh0, MaskHelper::blend<1, 1, 0, 0, 1, 1, 0, 0>());
02b6fb21SMehdi Amini  Value s2 =
02b6fb21SMehdi Amini      mm256BlendPsAsm(ib, t1, sh2, MaskHelper::blend<0, 0, 1, 1, 0, 0, 1, 1>());
02b6fb21SMehdi Amini  Value s3 =
02b6fb21SMehdi Amini      mm256BlendPsAsm(ib, t3, sh2, MaskHelper::blend<1, 1, 0, 0, 1, 1, 0, 0>());
02b6fb21SMehdi Amini  Value s4 =
02b6fb21SMehdi Amini      mm256BlendPsAsm(ib, t4, sh4, MaskHelper::blend<0, 0, 1, 1, 0, 0, 1, 1>());
02b6fb21SMehdi Amini  Value s5 =
02b6fb21SMehdi Amini      mm256BlendPsAsm(ib, t6, sh4, MaskHelper::blend<1, 1, 0, 0, 1, 1, 0, 0>());
02b6fb21SMehdi Amini  Value s6 =
02b6fb21SMehdi Amini      mm256BlendPsAsm(ib, t5, sh6, MaskHelper::blend<0, 0, 1, 1, 0, 0, 1, 1>());
02b6fb21SMehdi Amini  Value s7 =
02b6fb21SMehdi Amini      mm256BlendPsAsm(ib, t7, sh6, MaskHelper::blend<1, 1, 0, 0, 1, 1, 0, 0>());
b2729fdaSNicolas Vasilache
02b6fb21SMehdi Amini  vs[0] = mm256Permute2f128Ps(ib, s0, s4, MaskHelper::permute<2, 0>());
02b6fb21SMehdi Amini  vs[1] = mm256Permute2f128Ps(ib, s1, s5, MaskHelper::permute<2, 0>());
02b6fb21SMehdi Amini  vs[2] = mm256Permute2f128Ps(ib, s2, s6, MaskHelper::permute<2, 0>());
02b6fb21SMehdi Amini  vs[3] = mm256Permute2f128Ps(ib, s3, s7, MaskHelper::permute<2, 0>());
02b6fb21SMehdi Amini  vs[4] = mm256Permute2f128Ps(ib, s0, s4, MaskHelper::permute<3, 1>());
02b6fb21SMehdi Amini  vs[5] = mm256Permute2f128Ps(ib, s1, s5, MaskHelper::permute<3, 1>());
02b6fb21SMehdi Amini  vs[6] = mm256Permute2f128Ps(ib, s2, s6, MaskHelper::permute<3, 1>());
02b6fb21SMehdi Amini  vs[7] = mm256Permute2f128Ps(ib, s3, s7, MaskHelper::permute<3, 1>());
34ff8573SNicolas Vasilache}
34ff8573SNicolas Vasilache
d7e0a084SDiego Caballero/// Given the n-D transpose pattern 'transp', return true if 'dim0' and 'dim1'
d7e0a084SDiego Caballero/// should be transposed with each other within the context of their 2D
d7e0a084SDiego Caballero/// transposition slice.
d7e0a084SDiego Caballero///
d7e0a084SDiego Caballero/// Example 1: dim0 = 0, dim1 = 2, transp = [2, 1, 0]
d7e0a084SDiego Caballero///   Return true: dim0 and dim1 are transposed within the context of their 2D
d7e0a084SDiego Caballero///   transposition slice ([1, 0]).
d7e0a084SDiego Caballero///
d7e0a084SDiego Caballero/// Example 2: dim0 = 0, dim1 = 1, transp = [2, 1, 0]
d7e0a084SDiego Caballero///   Return true: dim0 and dim1 are transposed within the context of their 2D
d7e0a084SDiego Caballero///   transposition slice ([1, 0]). Paradoxically, note how dim1 (1) is *not*
d7e0a084SDiego Caballero///   transposed within the full context of the transposition.
d7e0a084SDiego Caballero///
d7e0a084SDiego Caballero/// Example 3: dim0 = 0, dim1 = 1, transp = [2, 0, 1]
d7e0a084SDiego Caballero///   Return false: dim0 and dim1 are *not* transposed within the context of
d7e0a084SDiego Caballero///   their 2D transposition slice ([0, 1]). Paradoxically, note how dim0 (0)
d7e0a084SDiego Caballero///   and dim1 (1) are transposed within the full context of the of the
d7e0a084SDiego Caballero///   transposition.
d7e0a084SDiego Caballerostatic bool areDimsTransposedIn2DSlice(int64_t dim0, int64_t dim1,
d7e0a084SDiego Caballero                                       ArrayRef<int64_t> transp) {
d7e0a084SDiego Caballero  // Perform a linear scan along the dimensions of the transposed pattern. If
d7e0a084SDiego Caballero  // dim0 is found first, dim0 and dim1 are not transposed within the context of
d7e0a084SDiego Caballero  // their 2D slice. Otherwise, 'dim1' is found first and they are transposed.
d7e0a084SDiego Caballero  for (int64_t permDim : transp) {
d7e0a084SDiego Caballero    if (permDim == dim0)
d7e0a084SDiego Caballero      return false;
d7e0a084SDiego Caballero    if (permDim == dim1)
d7e0a084SDiego Caballero      return true;
d7e0a084SDiego Caballero  }
d7e0a084SDiego Caballero
d7e0a084SDiego Caballero  llvm_unreachable("Ill-formed transpose pattern");
d7e0a084SDiego Caballero}
d7e0a084SDiego Caballero
d7e0a084SDiego Caballero/// Rewrite AVX2-specific vector.transpose, for the supported cases and
d7e0a084SDiego Caballero/// depending on the `TransposeLoweringOptions`. The lowering supports 2-D
d7e0a084SDiego Caballero/// transpose cases and n-D cases that have been decomposed into 2-D
d7e0a084SDiego Caballero/// transposition slices. For example, a 3-D transpose:
d7e0a084SDiego Caballero///
d7e0a084SDiego Caballero///   %0 = vector.transpose %arg0, [2, 0, 1]
d7e0a084SDiego Caballero///      : vector<1024x2048x4096xf32> to vector<4096x1024x2048xf32>
d7e0a084SDiego Caballero///
d7e0a084SDiego Caballero/// could be sliced into 2-D transposes by tiling two of its dimensions to one
d7e0a084SDiego Caballero/// of the vector lengths supported by the AVX2 patterns (e.g., 4x8):
d7e0a084SDiego Caballero///
d7e0a084SDiego Caballero///   %0 = vector.transpose %arg0, [2, 0, 1]
d7e0a084SDiego Caballero///      : vector<1x4x8xf32> to vector<8x1x4xf32>
d7e0a084SDiego Caballero///
d7e0a084SDiego Caballero/// This lowering will analyze the n-D vector.transpose and determine if it's a
d7e0a084SDiego Caballero/// supported 2-D transposition slice where any of the AVX2 patterns can be
d7e0a084SDiego Caballero/// applied.
34ff8573SNicolas Vasilacheclass TransposeOpLowering : public OpRewritePattern<vector::TransposeOp> {
34ff8573SNicolas Vasilachepublic:
34ff8573SNicolas Vasilache  using OpRewritePattern<vector::TransposeOp>::OpRewritePattern;
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilache  TransposeOpLowering(LoweringOptions loweringOptions, MLIRContext *context,
34ff8573SNicolas Vasilache                      int benefit)
34ff8573SNicolas Vasilache      : OpRewritePattern<vector::TransposeOp>(context, benefit),
34ff8573SNicolas Vasilache        loweringOptions(loweringOptions) {}
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilache  LogicalResult matchAndRewrite(vector::TransposeOp op,
34ff8573SNicolas Vasilache                                PatternRewriter &rewriter) const override {
34ff8573SNicolas Vasilache    auto loc = op.getLoc();
34ff8573SNicolas Vasilache
d7e0a084SDiego Caballero    // Check if the source vector type is supported. AVX2 patterns can only be
875bbce9SDiego Caballero    // applied to f32 vector types with two dimensions greater than one.
34ff8573SNicolas Vasilache    VectorType srcType = op.getVectorType();
875bbce9SDiego Caballero    if (!srcType.getElementType().isF32())
875bbce9SDiego Caballero      return rewriter.notifyMatchFailure(op, "Unsupported vector element type");
875bbce9SDiego Caballero
d7e0a084SDiego Caballero    SmallVector<int64_t> srcGtOneDims;
d7e0a084SDiego Caballero    for (auto &en : llvm::enumerate(srcType.getShape()))
d7e0a084SDiego Caballero      if (en.value() > 1)
d7e0a084SDiego Caballero        srcGtOneDims.push_back(en.index());
d7e0a084SDiego Caballero
d7e0a084SDiego Caballero    if (srcGtOneDims.size() != 2)
d7e0a084SDiego Caballero      return rewriter.notifyMatchFailure(op, "Unsupported vector type");
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilache    SmallVector<int64_t, 4> transp;
7c38fd60SJacques Pienaar    for (auto attr : op.getTransp())
34ff8573SNicolas Vasilache      transp.push_back(attr.cast<IntegerAttr>().getInt());
34ff8573SNicolas Vasilache
d7e0a084SDiego Caballero    // Check whether the two source vector dimensions that are greater than one
d7e0a084SDiego Caballero    // must be transposed with each other so that we can apply one of the 2-D
d7e0a084SDiego Caballero    // AVX2 transpose pattens. Otherwise, these patterns are not applicable.
d7e0a084SDiego Caballero    if (!areDimsTransposedIn2DSlice(srcGtOneDims[0], srcGtOneDims[1], transp))
d7e0a084SDiego Caballero      return rewriter.notifyMatchFailure(
d7e0a084SDiego Caballero          op, "Not applicable to this transpose permutation");
d7e0a084SDiego Caballero
d7e0a084SDiego Caballero    // Retrieve the sizes of the two dimensions greater than one to be
d7e0a084SDiego Caballero    // transposed.
d7e0a084SDiego Caballero    auto srcShape = srcType.getShape();
d7e0a084SDiego Caballero    int64_t m = srcShape[srcGtOneDims[0]], n = srcShape[srcGtOneDims[1]];
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilache    auto applyRewrite = [&]() {
34ff8573SNicolas Vasilache      ImplicitLocOpBuilder ib(loc, rewriter);
34ff8573SNicolas Vasilache      SmallVector<Value> vs;
d7e0a084SDiego Caballero
d7e0a084SDiego Caballero      // Reshape the n-D input vector with only two dimensions greater than one
d7e0a084SDiego Caballero      // to a 2-D vector.
d7e0a084SDiego Caballero      auto flattenedType =
d7e0a084SDiego Caballero          VectorType::get({n * m}, op.getVectorType().getElementType());
d7e0a084SDiego Caballero      auto reshInputType = VectorType::get({m, n}, srcType.getElementType());
d7e0a084SDiego Caballero      auto reshInput =
7c38fd60SJacques Pienaar          ib.create<vector::ShapeCastOp>(flattenedType, op.getVector());
d7e0a084SDiego Caballero      reshInput = ib.create<vector::ShapeCastOp>(reshInputType, reshInput);
d7e0a084SDiego Caballero
d7e0a084SDiego Caballero      // Extract 1-D vectors from the higher-order dimension of the input
d7e0a084SDiego Caballero      // vector.
34ff8573SNicolas Vasilache      for (int64_t i = 0; i < m; ++i)
d7e0a084SDiego Caballero        vs.push_back(ib.create<vector::ExtractOp>(reshInput, i));
d7e0a084SDiego Caballero
d7e0a084SDiego Caballero      // Transpose set of 1-D vectors.
34ff8573SNicolas Vasilache      if (m == 4)
34ff8573SNicolas Vasilache        transpose4x8xf32(ib, vs);
34ff8573SNicolas Vasilache      if (m == 8)
34ff8573SNicolas Vasilache        transpose8x8xf32(ib, vs);
d7e0a084SDiego Caballero
d7e0a084SDiego Caballero      // Insert transposed 1-D vectors into the higher-order dimension of the
d7e0a084SDiego Caballero      // output vector.
d7e0a084SDiego Caballero      Value res = ib.create<arith::ConstantOp>(reshInputType,
d7e0a084SDiego Caballero                                               ib.getZeroAttr(reshInputType));
34ff8573SNicolas Vasilache      for (int64_t i = 0; i < m; ++i)
34ff8573SNicolas Vasilache        res = ib.create<vector::InsertOp>(vs[i], res, i);
34ff8573SNicolas Vasilache
d7e0a084SDiego Caballero      // The output vector still has the shape of the input vector (e.g., 4x8).
d7e0a084SDiego Caballero      // We have to transpose their dimensions and retrieve its original rank
d7e0a084SDiego Caballero      // (e.g., 1x8x1x4x1).
d7e0a084SDiego Caballero      res = ib.create<vector::ShapeCastOp>(flattenedType, res);
d7e0a084SDiego Caballero      res = ib.create<vector::ShapeCastOp>(op.getResultType(), res);
34ff8573SNicolas Vasilache      rewriter.replaceOp(op, res);
34ff8573SNicolas Vasilache      return success();
34ff8573SNicolas Vasilache    };
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilache    if (loweringOptions.transposeOptions.lower4x8xf32_ && m == 4 && n == 8)
34ff8573SNicolas Vasilache      return applyRewrite();
34ff8573SNicolas Vasilache    if (loweringOptions.transposeOptions.lower8x8xf32_ && m == 8 && n == 8)
34ff8573SNicolas Vasilache      return applyRewrite();
34ff8573SNicolas Vasilache    return failure();
34ff8573SNicolas Vasilache  }
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilacheprivate:
34ff8573SNicolas Vasilache  LoweringOptions loweringOptions;
34ff8573SNicolas Vasilache};
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilachevoid mlir::x86vector::avx2::populateSpecializedTransposeLoweringPatterns(
34ff8573SNicolas Vasilache    RewritePatternSet &patterns, LoweringOptions options, int benefit) {
34ff8573SNicolas Vasilache  patterns.add<TransposeOpLowering>(options, patterns.getContext(), benefit);
34ff8573SNicolas Vasilache}