1f1933329SEugene Zelenko //===- InterleavedAccessPass.cpp ------------------------------------------===//
21c1e0c9eSHao Liu //
32946cd70SChandler Carruth // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
42946cd70SChandler Carruth // See https://llvm.org/LICENSE.txt for license information.
52946cd70SChandler Carruth // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
61c1e0c9eSHao Liu //
71c1e0c9eSHao Liu //===----------------------------------------------------------------------===//
81c1e0c9eSHao Liu //
91c1e0c9eSHao Liu // This file implements the Interleaved Access pass, which identifies
10a306eeb2SChad Rosier // interleaved memory accesses and transforms them into target specific
11a306eeb2SChad Rosier // intrinsics.
121c1e0c9eSHao Liu //
131c1e0c9eSHao Liu // An interleaved load reads data from memory into several vectors, with
141c1e0c9eSHao Liu // DE-interleaving the data on a factor. An interleaved store writes several
151c1e0c9eSHao Liu // vectors to memory with RE-interleaving the data on a factor.
161c1e0c9eSHao Liu //
17a306eeb2SChad Rosier // As interleaved accesses are difficult to identified in CodeGen (mainly
18a306eeb2SChad Rosier // because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector
19a306eeb2SChad Rosier // IR), we identify and transform them to intrinsics in this pass so the
20a306eeb2SChad Rosier // intrinsics can be easily matched into target specific instructions later in
21a306eeb2SChad Rosier // CodeGen.
221c1e0c9eSHao Liu //
231c1e0c9eSHao Liu // E.g. An interleaved load (Factor = 2):
241c1e0c9eSHao Liu //        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
259f2d9364SJuneyoung Lee //        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
269f2d9364SJuneyoung Lee //        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
271c1e0c9eSHao Liu //
281c1e0c9eSHao Liu // It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
291c1e0c9eSHao Liu // intrinsic in ARM backend.
301c1e0c9eSHao Liu //
3101a057a0SDavid L Kreitzer // In X86, this can be further optimized into a set of target
3201a057a0SDavid L Kreitzer // specific loads followed by an optimized sequence of shuffles.
3301a057a0SDavid L Kreitzer //
341c1e0c9eSHao Liu // E.g. An interleaved store (Factor = 3):
351c1e0c9eSHao Liu //        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
361c1e0c9eSHao Liu //                                    <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
371c1e0c9eSHao Liu //        store <12 x i32> %i.vec, <12 x i32>* %ptr
381c1e0c9eSHao Liu //
391c1e0c9eSHao Liu // It could be transformed into a st3 intrinsic in AArch64 backend or a vst3
401c1e0c9eSHao Liu // intrinsic in ARM backend.
411c1e0c9eSHao Liu //
4201a057a0SDavid L Kreitzer // Similarly, a set of interleaved stores can be transformed into an optimized
4301a057a0SDavid L Kreitzer // sequence of shuffles followed by a set of target specific stores for X86.
44f1933329SEugene Zelenko //
451c1e0c9eSHao Liu //===----------------------------------------------------------------------===//
461c1e0c9eSHao Liu 
47f1933329SEugene Zelenko #include "llvm/ADT/ArrayRef.h"
48f1933329SEugene Zelenko #include "llvm/ADT/DenseMap.h"
49ed98c1b3Sserge-sans-paille #include "llvm/ADT/SetVector.h"
50f1933329SEugene Zelenko #include "llvm/ADT/SmallVector.h"
51b3bde2eaSDavid Blaikie #include "llvm/CodeGen/TargetLowering.h"
528b61764cSFrancis Visoiu Mistrih #include "llvm/CodeGen/TargetPassConfig.h"
53b3bde2eaSDavid Blaikie #include "llvm/CodeGen/TargetSubtargetInfo.h"
54f1933329SEugene Zelenko #include "llvm/IR/Constants.h"
55476c0afcSMatthew Simpson #include "llvm/IR/Dominators.h"
56f1933329SEugene Zelenko #include "llvm/IR/Function.h"
57f1933329SEugene Zelenko #include "llvm/IR/IRBuilder.h"
581c1e0c9eSHao Liu #include "llvm/IR/InstIterator.h"
59f1933329SEugene Zelenko #include "llvm/IR/Instruction.h"
60f1933329SEugene Zelenko #include "llvm/IR/Instructions.h"
6105da2fe5SReid Kleckner #include "llvm/InitializePasses.h"
62f1933329SEugene Zelenko #include "llvm/Pass.h"
63f1933329SEugene Zelenko #include "llvm/Support/Casting.h"
64f1933329SEugene Zelenko #include "llvm/Support/CommandLine.h"
651c1e0c9eSHao Liu #include "llvm/Support/Debug.h"
661c1e0c9eSHao Liu #include "llvm/Support/MathExtras.h"
67b41c0b44SHao Liu #include "llvm/Support/raw_ostream.h"
68f1933329SEugene Zelenko #include "llvm/Target/TargetMachine.h"
69a4b6b1e1SDavid Green #include "llvm/Transforms/Utils/Local.h"
70f1933329SEugene Zelenko #include <cassert>
71f1933329SEugene Zelenko #include <utility>
721c1e0c9eSHao Liu 
731c1e0c9eSHao Liu using namespace llvm;
741c1e0c9eSHao Liu 
751c1e0c9eSHao Liu #define DEBUG_TYPE "interleaved-access"
761c1e0c9eSHao Liu 
771c1e0c9eSHao Liu static cl::opt<bool> LowerInterleavedAccesses(
781c1e0c9eSHao Liu     "lower-interleaved-accesses",
791c1e0c9eSHao Liu     cl::desc("Enable lowering interleaved accesses to intrinsics"),
806d3f05c0SSilviu Baranga     cl::init(true), cl::Hidden);
811c1e0c9eSHao Liu 
821c1e0c9eSHao Liu namespace {
831c1e0c9eSHao Liu 
841c1e0c9eSHao Liu class InterleavedAccess : public FunctionPass {
851c1e0c9eSHao Liu public:
861c1e0c9eSHao Liu   static char ID;
87f1933329SEugene Zelenko 
InterleavedAccess()88f1933329SEugene Zelenko   InterleavedAccess() : FunctionPass(ID) {
891c1e0c9eSHao Liu     initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
901c1e0c9eSHao Liu   }
911c1e0c9eSHao Liu 
getPassName() const92117296c0SMehdi Amini   StringRef getPassName() const override { return "Interleaved Access Pass"; }
931c1e0c9eSHao Liu 
941c1e0c9eSHao Liu   bool runOnFunction(Function &F) override;
951c1e0c9eSHao Liu 
getAnalysisUsage(AnalysisUsage & AU) const96476c0afcSMatthew Simpson   void getAnalysisUsage(AnalysisUsage &AU) const override {
97476c0afcSMatthew Simpson     AU.addRequired<DominatorTreeWrapperPass>();
98c49611f9SDavid Green     AU.setPreservesCFG();
99476c0afcSMatthew Simpson   }
100476c0afcSMatthew Simpson 
1011c1e0c9eSHao Liu private:
102f1933329SEugene Zelenko   DominatorTree *DT = nullptr;
103f1933329SEugene Zelenko   const TargetLowering *TLI = nullptr;
1041c1e0c9eSHao Liu 
1051e425c9fSBenjamin Kramer   /// The maximum supported interleave factor.
1061e425c9fSBenjamin Kramer   unsigned MaxFactor;
1071e425c9fSBenjamin Kramer 
1085f8f34e4SAdrian Prantl   /// Transform an interleaved load into target specific intrinsics.
1091c1e0c9eSHao Liu   bool lowerInterleavedLoad(LoadInst *LI,
1101c1e0c9eSHao Liu                             SmallVector<Instruction *, 32> &DeadInsts);
1111c1e0c9eSHao Liu 
1125f8f34e4SAdrian Prantl   /// Transform an interleaved store into target specific intrinsics.
1131c1e0c9eSHao Liu   bool lowerInterleavedStore(StoreInst *SI,
1141c1e0c9eSHao Liu                              SmallVector<Instruction *, 32> &DeadInsts);
115476c0afcSMatthew Simpson 
1165f8f34e4SAdrian Prantl   /// Returns true if the uses of an interleaved load by the
117476c0afcSMatthew Simpson   /// extractelement instructions in \p Extracts can be replaced by uses of the
118476c0afcSMatthew Simpson   /// shufflevector instructions in \p Shuffles instead. If so, the necessary
119476c0afcSMatthew Simpson   /// replacements are also performed.
120476c0afcSMatthew Simpson   bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
121476c0afcSMatthew Simpson                           ArrayRef<ShuffleVectorInst *> Shuffles);
122a4b6b1e1SDavid Green 
123a4b6b1e1SDavid Green   /// Given a number of shuffles of the form shuffle(binop(x,y)), convert them
124a4b6b1e1SDavid Green   /// to binop(shuffle(x), shuffle(y)) to allow the formation of an
125a4b6b1e1SDavid Green   /// interleaving load. Any newly created shuffles that operate on \p LI will
126ed936aadSFlorian Hahn   /// be added to \p Shuffles. Returns true, if any changes to the IR have been
127ed936aadSFlorian Hahn   /// made.
128ed936aadSFlorian Hahn   bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
129a4b6b1e1SDavid Green                             SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
130a4b6b1e1SDavid Green                             LoadInst *LI);
1311c1e0c9eSHao Liu };
132f1933329SEugene Zelenko 
1331c1e0c9eSHao Liu } // end anonymous namespace.
1341c1e0c9eSHao Liu 
1351c1e0c9eSHao Liu char InterleavedAccess::ID = 0;
136f1933329SEugene Zelenko 
1371527baabSMatthias Braun INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE,
138476c0afcSMatthew Simpson     "Lower interleaved memory accesses to target specific intrinsics", false,
139476c0afcSMatthew Simpson     false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)140476c0afcSMatthew Simpson INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
1411527baabSMatthias Braun INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE,
142476c0afcSMatthew Simpson     "Lower interleaved memory accesses to target specific intrinsics", false,
143476c0afcSMatthew Simpson     false)
1441c1e0c9eSHao Liu 
1458b61764cSFrancis Visoiu Mistrih FunctionPass *llvm::createInterleavedAccessPass() {
1468b61764cSFrancis Visoiu Mistrih   return new InterleavedAccess();
1471c1e0c9eSHao Liu }
1481c1e0c9eSHao Liu 
1495f8f34e4SAdrian Prantl /// Check if the mask is a DE-interleave mask of the given factor
1501c1e0c9eSHao Liu /// \p Factor like:
1511c1e0c9eSHao Liu ///     <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
isDeInterleaveMaskOfFactor(ArrayRef<int> Mask,unsigned Factor,unsigned & Index)1521c1e0c9eSHao Liu static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
1531c1e0c9eSHao Liu                                        unsigned &Index) {
1541c1e0c9eSHao Liu   // Check all potential start indices from 0 to (Factor - 1).
1551c1e0c9eSHao Liu   for (Index = 0; Index < Factor; Index++) {
1561c1e0c9eSHao Liu     unsigned i = 0;
1571c1e0c9eSHao Liu 
1581c1e0c9eSHao Liu     // Check that elements are in ascending order by Factor. Ignore undef
1591c1e0c9eSHao Liu     // elements.
1601c1e0c9eSHao Liu     for (; i < Mask.size(); i++)
1611c1e0c9eSHao Liu       if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
1621c1e0c9eSHao Liu         break;
1631c1e0c9eSHao Liu 
1641c1e0c9eSHao Liu     if (i == Mask.size())
1651c1e0c9eSHao Liu       return true;
1661c1e0c9eSHao Liu   }
1671c1e0c9eSHao Liu 
1681c1e0c9eSHao Liu   return false;
1691c1e0c9eSHao Liu }
1701c1e0c9eSHao Liu 
1715f8f34e4SAdrian Prantl /// Check if the mask is a DE-interleave mask for an interleaved load.
1721c1e0c9eSHao Liu ///
1731c1e0c9eSHao Liu /// E.g. DE-interleave masks (Factor = 2) could be:
1741c1e0c9eSHao Liu ///     <0, 2, 4, 6>    (mask of index 0 to extract even elements)
1751c1e0c9eSHao Liu ///     <1, 3, 5, 7>    (mask of index 1 to extract odd elements)
isDeInterleaveMask(ArrayRef<int> Mask,unsigned & Factor,unsigned & Index,unsigned MaxFactor,unsigned NumLoadElements)1761c1e0c9eSHao Liu static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
17796f295e2SEli Friedman                                unsigned &Index, unsigned MaxFactor,
17896f295e2SEli Friedman                                unsigned NumLoadElements) {
1791c1e0c9eSHao Liu   if (Mask.size() < 2)
1801c1e0c9eSHao Liu     return false;
1811c1e0c9eSHao Liu 
1821c1e0c9eSHao Liu   // Check potential Factors.
18396f295e2SEli Friedman   for (Factor = 2; Factor <= MaxFactor; Factor++) {
18496f295e2SEli Friedman     // Make sure we don't produce a load wider than the input load.
18596f295e2SEli Friedman     if (Mask.size() * Factor > NumLoadElements)
18696f295e2SEli Friedman       return false;
1871c1e0c9eSHao Liu     if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
1881c1e0c9eSHao Liu       return true;
18996f295e2SEli Friedman   }
1901c1e0c9eSHao Liu 
1911c1e0c9eSHao Liu   return false;
1921c1e0c9eSHao Liu }
1931c1e0c9eSHao Liu 
1945f8f34e4SAdrian Prantl /// Check if the mask can be used in an interleaved store.
19577c5eaaeSAlina Sbirlea //
19677c5eaaeSAlina Sbirlea /// It checks for a more general pattern than the RE-interleave mask.
19777c5eaaeSAlina Sbirlea /// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
19877c5eaaeSAlina Sbirlea /// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
19977c5eaaeSAlina Sbirlea /// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
20077c5eaaeSAlina Sbirlea /// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
2011c1e0c9eSHao Liu ///
20277c5eaaeSAlina Sbirlea /// The particular case of an RE-interleave mask is:
20377c5eaaeSAlina Sbirlea /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
20477c5eaaeSAlina Sbirlea /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
isReInterleaveMask(ArrayRef<int> Mask,unsigned & Factor,unsigned MaxFactor,unsigned OpNumElts)2051e425c9fSBenjamin Kramer static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
20601fa9622SMatthias Braun                                unsigned MaxFactor, unsigned OpNumElts) {
2071c1e0c9eSHao Liu   unsigned NumElts = Mask.size();
2081c1e0c9eSHao Liu   if (NumElts < 4)
2091c1e0c9eSHao Liu     return false;
2101c1e0c9eSHao Liu 
2111c1e0c9eSHao Liu   // Check potential Factors.
2121c1e0c9eSHao Liu   for (Factor = 2; Factor <= MaxFactor; Factor++) {
2131c1e0c9eSHao Liu     if (NumElts % Factor)
2141c1e0c9eSHao Liu       continue;
2151c1e0c9eSHao Liu 
21677c5eaaeSAlina Sbirlea     unsigned LaneLen = NumElts / Factor;
21777c5eaaeSAlina Sbirlea     if (!isPowerOf2_32(LaneLen))
2181c1e0c9eSHao Liu       continue;
2191c1e0c9eSHao Liu 
22077c5eaaeSAlina Sbirlea     // Check whether each element matches the general interleaved rule.
22177c5eaaeSAlina Sbirlea     // Ignore undef elements, as long as the defined elements match the rule.
22277c5eaaeSAlina Sbirlea     // Outer loop processes all factors (x, y, z in the above example)
22377c5eaaeSAlina Sbirlea     unsigned I = 0, J;
22477c5eaaeSAlina Sbirlea     for (; I < Factor; I++) {
22577c5eaaeSAlina Sbirlea       unsigned SavedLaneValue;
22677c5eaaeSAlina Sbirlea       unsigned SavedNoUndefs = 0;
22777c5eaaeSAlina Sbirlea 
22877c5eaaeSAlina Sbirlea       // Inner loop processes consecutive accesses (x, x+1... in the example)
22977c5eaaeSAlina Sbirlea       for (J = 0; J < LaneLen - 1; J++) {
23077c5eaaeSAlina Sbirlea         // Lane computes x's position in the Mask
23177c5eaaeSAlina Sbirlea         unsigned Lane = J * Factor + I;
23277c5eaaeSAlina Sbirlea         unsigned NextLane = Lane + Factor;
23377c5eaaeSAlina Sbirlea         int LaneValue = Mask[Lane];
23477c5eaaeSAlina Sbirlea         int NextLaneValue = Mask[NextLane];
23577c5eaaeSAlina Sbirlea 
23677c5eaaeSAlina Sbirlea         // If both are defined, values must be sequential
23777c5eaaeSAlina Sbirlea         if (LaneValue >= 0 && NextLaneValue >= 0 &&
23877c5eaaeSAlina Sbirlea             LaneValue + 1 != NextLaneValue)
2391c1e0c9eSHao Liu           break;
2401c1e0c9eSHao Liu 
24177c5eaaeSAlina Sbirlea         // If the next value is undef, save the current one as reference
24277c5eaaeSAlina Sbirlea         if (LaneValue >= 0 && NextLaneValue < 0) {
24377c5eaaeSAlina Sbirlea           SavedLaneValue = LaneValue;
24477c5eaaeSAlina Sbirlea           SavedNoUndefs = 1;
24577c5eaaeSAlina Sbirlea         }
24677c5eaaeSAlina Sbirlea 
24777c5eaaeSAlina Sbirlea         // Undefs are allowed, but defined elements must still be consecutive:
24877c5eaaeSAlina Sbirlea         // i.e.: x,..., undef,..., x + 2,..., undef,..., undef,..., x + 5, ....
24977c5eaaeSAlina Sbirlea         // Verify this by storing the last non-undef followed by an undef
25077c5eaaeSAlina Sbirlea         // Check that following non-undef masks are incremented with the
25177c5eaaeSAlina Sbirlea         // corresponding distance.
25277c5eaaeSAlina Sbirlea         if (SavedNoUndefs > 0 && LaneValue < 0) {
25377c5eaaeSAlina Sbirlea           SavedNoUndefs++;
25477c5eaaeSAlina Sbirlea           if (NextLaneValue >= 0 &&
25577c5eaaeSAlina Sbirlea               SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue)
25677c5eaaeSAlina Sbirlea             break;
25777c5eaaeSAlina Sbirlea         }
25877c5eaaeSAlina Sbirlea       }
25977c5eaaeSAlina Sbirlea 
26077c5eaaeSAlina Sbirlea       if (J < LaneLen - 1)
26177c5eaaeSAlina Sbirlea         break;
26277c5eaaeSAlina Sbirlea 
26377c5eaaeSAlina Sbirlea       int StartMask = 0;
26477c5eaaeSAlina Sbirlea       if (Mask[I] >= 0) {
26577c5eaaeSAlina Sbirlea         // Check that the start of the I range (J=0) is greater than 0
26677c5eaaeSAlina Sbirlea         StartMask = Mask[I];
26777c5eaaeSAlina Sbirlea       } else if (Mask[(LaneLen - 1) * Factor + I] >= 0) {
26877c5eaaeSAlina Sbirlea         // StartMask defined by the last value in lane
26977c5eaaeSAlina Sbirlea         StartMask = Mask[(LaneLen - 1) * Factor + I] - J;
27077c5eaaeSAlina Sbirlea       } else if (SavedNoUndefs > 0) {
27177c5eaaeSAlina Sbirlea         // StartMask defined by some non-zero value in the j loop
27277c5eaaeSAlina Sbirlea         StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs);
27377c5eaaeSAlina Sbirlea       }
27477c5eaaeSAlina Sbirlea       // else StartMask remains set to 0, i.e. all elements are undefs
27577c5eaaeSAlina Sbirlea 
27677c5eaaeSAlina Sbirlea       if (StartMask < 0)
27777c5eaaeSAlina Sbirlea         break;
27801fa9622SMatthias Braun       // We must stay within the vectors; This case can happen with undefs.
27901fa9622SMatthias Braun       if (StartMask + LaneLen > OpNumElts*2)
28001fa9622SMatthias Braun         break;
28177c5eaaeSAlina Sbirlea     }
28277c5eaaeSAlina Sbirlea 
28377c5eaaeSAlina Sbirlea     // Found an interleaved mask of current factor.
28477c5eaaeSAlina Sbirlea     if (I == Factor)
2851c1e0c9eSHao Liu       return true;
2861c1e0c9eSHao Liu   }
2871c1e0c9eSHao Liu 
2881c1e0c9eSHao Liu   return false;
2891c1e0c9eSHao Liu }
2901c1e0c9eSHao Liu 
lowerInterleavedLoad(LoadInst * LI,SmallVector<Instruction *,32> & DeadInsts)2911c1e0c9eSHao Liu bool InterleavedAccess::lowerInterleavedLoad(
2921c1e0c9eSHao Liu     LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts) {
293ff5b9a7bSChristopher Tetreault   if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
2941c1e0c9eSHao Liu     return false;
2951c1e0c9eSHao Liu 
296a4b6b1e1SDavid Green   // Check if all users of this load are shufflevectors. If we encounter any
297a4b6b1e1SDavid Green   // users that are extractelement instructions or binary operators, we save
298a4b6b1e1SDavid Green   // them to later check if they can be modified to extract from one of the
299a4b6b1e1SDavid Green   // shufflevectors instead of the load.
300a4b6b1e1SDavid Green 
3011c1e0c9eSHao Liu   SmallVector<ShuffleVectorInst *, 4> Shuffles;
302476c0afcSMatthew Simpson   SmallVector<ExtractElementInst *, 4> Extracts;
303a4b6b1e1SDavid Green   // BinOpShuffles need to be handled a single time in case both operands of the
304a4b6b1e1SDavid Green   // binop are the same load.
305a4b6b1e1SDavid Green   SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;
3061c1e0c9eSHao Liu 
307a4b6b1e1SDavid Green   for (auto *User : LI->users()) {
308a4b6b1e1SDavid Green     auto *Extract = dyn_cast<ExtractElementInst>(User);
309476c0afcSMatthew Simpson     if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
310476c0afcSMatthew Simpson       Extracts.push_back(Extract);
311476c0afcSMatthew Simpson       continue;
312476c0afcSMatthew Simpson     }
31328b41237SDavid Green     if (auto *BI = dyn_cast<BinaryOperator>(User)) {
31428b41237SDavid Green       if (all_of(BI->users(),
31528b41237SDavid Green                  [](auto *U) { return isa<ShuffleVectorInst>(U); })) {
31628b41237SDavid Green         for (auto *SVI : BI->users())
31728b41237SDavid Green           BinOpShuffles.insert(cast<ShuffleVectorInst>(SVI));
318a4b6b1e1SDavid Green         continue;
319a4b6b1e1SDavid Green       }
320a4b6b1e1SDavid Green     }
321a4b6b1e1SDavid Green     auto *SVI = dyn_cast<ShuffleVectorInst>(User);
3221c1e0c9eSHao Liu     if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
3231c1e0c9eSHao Liu       return false;
3241c1e0c9eSHao Liu 
3251c1e0c9eSHao Liu     Shuffles.push_back(SVI);
3261c1e0c9eSHao Liu   }
3271c1e0c9eSHao Liu 
328a4b6b1e1SDavid Green   if (Shuffles.empty() && BinOpShuffles.empty())
3291c1e0c9eSHao Liu     return false;
3301c1e0c9eSHao Liu 
3311c1e0c9eSHao Liu   unsigned Factor, Index;
3321c1e0c9eSHao Liu 
333ff5b9a7bSChristopher Tetreault   unsigned NumLoadElements =
334ff5b9a7bSChristopher Tetreault       cast<FixedVectorType>(LI->getType())->getNumElements();
335a4b6b1e1SDavid Green   auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
3361c1e0c9eSHao Liu   // Check if the first shufflevector is DE-interleave shuffle.
337a4b6b1e1SDavid Green   if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
338a4b6b1e1SDavid Green                           NumLoadElements))
3391c1e0c9eSHao Liu     return false;
3401c1e0c9eSHao Liu 
3411c1e0c9eSHao Liu   // Holds the corresponding index for each DE-interleave shuffle.
3421c1e0c9eSHao Liu   SmallVector<unsigned, 4> Indices;
3431c1e0c9eSHao Liu 
344a4b6b1e1SDavid Green   Type *VecTy = FirstSVI->getType();
3451c1e0c9eSHao Liu 
3461c1e0c9eSHao Liu   // Check if other shufflevectors are also DE-interleaved of the same type
3471c1e0c9eSHao Liu   // and factor as the first shufflevector.
348a4b6b1e1SDavid Green   for (auto *Shuffle : Shuffles) {
349a4b6b1e1SDavid Green     if (Shuffle->getType() != VecTy)
3501c1e0c9eSHao Liu       return false;
351a4b6b1e1SDavid Green     if (!isDeInterleaveMaskOfFactor(Shuffle->getShuffleMask(), Factor,
3521c1e0c9eSHao Liu                                     Index))
3531c1e0c9eSHao Liu       return false;
3541c1e0c9eSHao Liu 
3559f2d9364SJuneyoung Lee     assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
3561c1e0c9eSHao Liu     Indices.push_back(Index);
3571c1e0c9eSHao Liu   }
358a4b6b1e1SDavid Green   for (auto *Shuffle : BinOpShuffles) {
359a4b6b1e1SDavid Green     if (Shuffle->getType() != VecTy)
360a4b6b1e1SDavid Green       return false;
361a4b6b1e1SDavid Green     if (!isDeInterleaveMaskOfFactor(Shuffle->getShuffleMask(), Factor,
362a4b6b1e1SDavid Green                                     Index))
363a4b6b1e1SDavid Green       return false;
364a4b6b1e1SDavid Green 
3659f2d9364SJuneyoung Lee     assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
3669f2d9364SJuneyoung Lee 
367a4b6b1e1SDavid Green     if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LI)
368a4b6b1e1SDavid Green       Indices.push_back(Index);
369a4b6b1e1SDavid Green     if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LI)
370a4b6b1e1SDavid Green       Indices.push_back(Index);
371a4b6b1e1SDavid Green   }
3721c1e0c9eSHao Liu 
373476c0afcSMatthew Simpson   // Try and modify users of the load that are extractelement instructions to
374476c0afcSMatthew Simpson   // use the shufflevector instructions instead of the load.
375476c0afcSMatthew Simpson   if (!tryReplaceExtracts(Extracts, Shuffles))
376476c0afcSMatthew Simpson     return false;
377ed936aadSFlorian Hahn 
378ed936aadSFlorian Hahn   bool BinOpShuffleChanged =
379ed936aadSFlorian Hahn       replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI);
380476c0afcSMatthew Simpson 
381d34e60caSNicola Zaghen   LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
3821c1e0c9eSHao Liu 
3831c1e0c9eSHao Liu   // Try to create target specific intrinsics to replace the load and shuffles.
384ed936aadSFlorian Hahn   if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor)) {
385ed936aadSFlorian Hahn     // If Extracts is not empty, tryReplaceExtracts made changes earlier.
386ed936aadSFlorian Hahn     return !Extracts.empty() || BinOpShuffleChanged;
387ed936aadSFlorian Hahn   }
3881c1e0c9eSHao Liu 
3890da15ea5SKazu Hirata   append_range(DeadInsts, Shuffles);
3901c1e0c9eSHao Liu 
3911c1e0c9eSHao Liu   DeadInsts.push_back(LI);
3921c1e0c9eSHao Liu   return true;
3931c1e0c9eSHao Liu }
3941c1e0c9eSHao Liu 
replaceBinOpShuffles(ArrayRef<ShuffleVectorInst * > BinOpShuffles,SmallVectorImpl<ShuffleVectorInst * > & Shuffles,LoadInst * LI)395ed936aadSFlorian Hahn bool InterleavedAccess::replaceBinOpShuffles(
396a4b6b1e1SDavid Green     ArrayRef<ShuffleVectorInst *> BinOpShuffles,
397a4b6b1e1SDavid Green     SmallVectorImpl<ShuffleVectorInst *> &Shuffles, LoadInst *LI) {
398a4b6b1e1SDavid Green   for (auto *SVI : BinOpShuffles) {
399a4b6b1e1SDavid Green     BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
4009f2d9364SJuneyoung Lee     Type *BIOp0Ty = BI->getOperand(0)->getType();
401a4b6b1e1SDavid Green     ArrayRef<int> Mask = SVI->getShuffleMask();
4029f2d9364SJuneyoung Lee     assert(all_of(Mask, [&](int Idx) {
4039f2d9364SJuneyoung Lee       return Idx < (int)cast<FixedVectorType>(BIOp0Ty)->getNumElements();
4049f2d9364SJuneyoung Lee     }));
405a4b6b1e1SDavid Green 
4069f2d9364SJuneyoung Lee     auto *NewSVI1 =
4079f2d9364SJuneyoung Lee         new ShuffleVectorInst(BI->getOperand(0), PoisonValue::get(BIOp0Ty),
4089f2d9364SJuneyoung Lee                               Mask, SVI->getName(), SVI);
409a4b6b1e1SDavid Green     auto *NewSVI2 = new ShuffleVectorInst(
4109f2d9364SJuneyoung Lee         BI->getOperand(1), PoisonValue::get(BI->getOperand(1)->getType()), Mask,
411a4b6b1e1SDavid Green         SVI->getName(), SVI);
412fda8b471SDavid Green     BinaryOperator *NewBI = BinaryOperator::CreateWithCopiedFlags(
413fda8b471SDavid Green         BI->getOpcode(), NewSVI1, NewSVI2, BI, BI->getName(), SVI);
414a4b6b1e1SDavid Green     SVI->replaceAllUsesWith(NewBI);
415a4b6b1e1SDavid Green     LLVM_DEBUG(dbgs() << "  Replaced: " << *BI << "\n    And   : " << *SVI
416a4b6b1e1SDavid Green                       << "\n  With    : " << *NewSVI1 << "\n    And   : "
417a4b6b1e1SDavid Green                       << *NewSVI2 << "\n    And   : " << *NewBI << "\n");
418a4b6b1e1SDavid Green     RecursivelyDeleteTriviallyDeadInstructions(SVI);
419a4b6b1e1SDavid Green     if (NewSVI1->getOperand(0) == LI)
420a4b6b1e1SDavid Green       Shuffles.push_back(NewSVI1);
421a4b6b1e1SDavid Green     if (NewSVI2->getOperand(0) == LI)
422a4b6b1e1SDavid Green       Shuffles.push_back(NewSVI2);
423a4b6b1e1SDavid Green   }
424ed936aadSFlorian Hahn 
425ed936aadSFlorian Hahn   return !BinOpShuffles.empty();
426a4b6b1e1SDavid Green }
427a4b6b1e1SDavid Green 
tryReplaceExtracts(ArrayRef<ExtractElementInst * > Extracts,ArrayRef<ShuffleVectorInst * > Shuffles)428476c0afcSMatthew Simpson bool InterleavedAccess::tryReplaceExtracts(
429476c0afcSMatthew Simpson     ArrayRef<ExtractElementInst *> Extracts,
430476c0afcSMatthew Simpson     ArrayRef<ShuffleVectorInst *> Shuffles) {
431476c0afcSMatthew Simpson   // If there aren't any extractelement instructions to modify, there's nothing
432476c0afcSMatthew Simpson   // to do.
433476c0afcSMatthew Simpson   if (Extracts.empty())
434476c0afcSMatthew Simpson     return true;
435476c0afcSMatthew Simpson 
436476c0afcSMatthew Simpson   // Maps extractelement instructions to vector-index pairs. The extractlement
437476c0afcSMatthew Simpson   // instructions will be modified to use the new vector and index operands.
438476c0afcSMatthew Simpson   DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;
439476c0afcSMatthew Simpson 
440476c0afcSMatthew Simpson   for (auto *Extract : Extracts) {
441476c0afcSMatthew Simpson     // The vector index that is extracted.
442476c0afcSMatthew Simpson     auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
443476c0afcSMatthew Simpson     auto Index = IndexOperand->getSExtValue();
444476c0afcSMatthew Simpson 
445476c0afcSMatthew Simpson     // Look for a suitable shufflevector instruction. The goal is to modify the
446476c0afcSMatthew Simpson     // extractelement instruction (which uses an interleaved load) to use one
447476c0afcSMatthew Simpson     // of the shufflevector instructions instead of the load.
448476c0afcSMatthew Simpson     for (auto *Shuffle : Shuffles) {
449476c0afcSMatthew Simpson       // If the shufflevector instruction doesn't dominate the extract, we
450476c0afcSMatthew Simpson       // can't create a use of it.
451476c0afcSMatthew Simpson       if (!DT->dominates(Shuffle, Extract))
452476c0afcSMatthew Simpson         continue;
453476c0afcSMatthew Simpson 
454476c0afcSMatthew Simpson       // Inspect the indices of the shufflevector instruction. If the shuffle
455476c0afcSMatthew Simpson       // selects the same index that is extracted, we can modify the
456476c0afcSMatthew Simpson       // extractelement instruction.
457476c0afcSMatthew Simpson       SmallVector<int, 4> Indices;
458476c0afcSMatthew Simpson       Shuffle->getShuffleMask(Indices);
459476c0afcSMatthew Simpson       for (unsigned I = 0; I < Indices.size(); ++I)
460476c0afcSMatthew Simpson         if (Indices[I] == Index) {
461476c0afcSMatthew Simpson           assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
462476c0afcSMatthew Simpson                  "Vector operations do not match");
463476c0afcSMatthew Simpson           ReplacementMap[Extract] = std::make_pair(Shuffle, I);
464476c0afcSMatthew Simpson           break;
465476c0afcSMatthew Simpson         }
466476c0afcSMatthew Simpson 
467476c0afcSMatthew Simpson       // If we found a suitable shufflevector instruction, stop looking.
468476c0afcSMatthew Simpson       if (ReplacementMap.count(Extract))
469476c0afcSMatthew Simpson         break;
470476c0afcSMatthew Simpson     }
471476c0afcSMatthew Simpson 
472476c0afcSMatthew Simpson     // If we did not find a suitable shufflevector instruction, the
473476c0afcSMatthew Simpson     // extractelement instruction cannot be modified, so we must give up.
474476c0afcSMatthew Simpson     if (!ReplacementMap.count(Extract))
475476c0afcSMatthew Simpson       return false;
476476c0afcSMatthew Simpson   }
477476c0afcSMatthew Simpson 
478476c0afcSMatthew Simpson   // Finally, perform the replacements.
479476c0afcSMatthew Simpson   IRBuilder<> Builder(Extracts[0]->getContext());
480476c0afcSMatthew Simpson   for (auto &Replacement : ReplacementMap) {
481476c0afcSMatthew Simpson     auto *Extract = Replacement.first;
482476c0afcSMatthew Simpson     auto *Vector = Replacement.second.first;
483476c0afcSMatthew Simpson     auto Index = Replacement.second.second;
484476c0afcSMatthew Simpson     Builder.SetInsertPoint(Extract);
485476c0afcSMatthew Simpson     Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
486476c0afcSMatthew Simpson     Extract->eraseFromParent();
487476c0afcSMatthew Simpson   }
488476c0afcSMatthew Simpson 
489476c0afcSMatthew Simpson   return true;
490476c0afcSMatthew Simpson }
491476c0afcSMatthew Simpson 
lowerInterleavedStore(StoreInst * SI,SmallVector<Instruction *,32> & DeadInsts)4921c1e0c9eSHao Liu bool InterleavedAccess::lowerInterleavedStore(
4931c1e0c9eSHao Liu     StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
4941c1e0c9eSHao Liu   if (!SI->isSimple())
4951c1e0c9eSHao Liu     return false;
4961c1e0c9eSHao Liu 
497a4b6b1e1SDavid Green   auto *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
498ff5b9a7bSChristopher Tetreault   if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
4991c1e0c9eSHao Liu     return false;
5001c1e0c9eSHao Liu 
5011c1e0c9eSHao Liu   // Check if the shufflevector is RE-interleave shuffle.
5021c1e0c9eSHao Liu   unsigned Factor;
503889f6606SChristopher Tetreault   unsigned OpNumElts =
504ff5b9a7bSChristopher Tetreault       cast<FixedVectorType>(SVI->getOperand(0)->getType())->getNumElements();
50501fa9622SMatthias Braun   if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor, OpNumElts))
5061c1e0c9eSHao Liu     return false;
5071c1e0c9eSHao Liu 
508d34e60caSNicola Zaghen   LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
5091c1e0c9eSHao Liu 
5101c1e0c9eSHao Liu   // Try to create target specific intrinsics to replace the store and shuffle.
5111c1e0c9eSHao Liu   if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
5121c1e0c9eSHao Liu     return false;
5131c1e0c9eSHao Liu 
5141c1e0c9eSHao Liu   // Already have a new target specific interleaved store. Erase the old store.
5151c1e0c9eSHao Liu   DeadInsts.push_back(SI);
5161c1e0c9eSHao Liu   DeadInsts.push_back(SVI);
5171c1e0c9eSHao Liu   return true;
5181c1e0c9eSHao Liu }
5191c1e0c9eSHao Liu 
runOnFunction(Function & F)5201c1e0c9eSHao Liu bool InterleavedAccess::runOnFunction(Function &F) {
5218b61764cSFrancis Visoiu Mistrih   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
5228b61764cSFrancis Visoiu Mistrih   if (!TPC || !LowerInterleavedAccesses)
5231c1e0c9eSHao Liu     return false;
5241c1e0c9eSHao Liu 
525d34e60caSNicola Zaghen   LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
5261c1e0c9eSHao Liu 
527476c0afcSMatthew Simpson   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
5288b61764cSFrancis Visoiu Mistrih   auto &TM = TPC->getTM<TargetMachine>();
5298b61764cSFrancis Visoiu Mistrih   TLI = TM.getSubtargetImpl(F)->getTargetLowering();
5301c1e0c9eSHao Liu   MaxFactor = TLI->getMaxSupportedInterleaveFactor();
5311c1e0c9eSHao Liu 
5321c1e0c9eSHao Liu   // Holds dead instructions that will be erased later.
5331c1e0c9eSHao Liu   SmallVector<Instruction *, 32> DeadInsts;
5341c1e0c9eSHao Liu   bool Changed = false;
5351c1e0c9eSHao Liu 
53678199518SNico Rieck   for (auto &I : instructions(F)) {
537a4b6b1e1SDavid Green     if (auto *LI = dyn_cast<LoadInst>(&I))
5381c1e0c9eSHao Liu       Changed |= lowerInterleavedLoad(LI, DeadInsts);
5391c1e0c9eSHao Liu 
540a4b6b1e1SDavid Green     if (auto *SI = dyn_cast<StoreInst>(&I))
5411c1e0c9eSHao Liu       Changed |= lowerInterleavedStore(SI, DeadInsts);
5421c1e0c9eSHao Liu   }
5431c1e0c9eSHao Liu 
544*9e6d1f4bSKazu Hirata   for (auto *I : DeadInsts)
5451c1e0c9eSHao Liu     I->eraseFromParent();
5461c1e0c9eSHao Liu 
5471c1e0c9eSHao Liu   return Changed;
5481c1e0c9eSHao Liu }
549