1 //===- InterleavedAccessPass.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the Interleaved Access pass, which identifies
10 // interleaved memory accesses and transforms them into target specific
11 // intrinsics.
12 //
13 // An interleaved load reads data from memory into several vectors, with
14 // DE-interleaving the data on a factor. An interleaved store writes several
15 // vectors to memory with RE-interleaving the data on a factor.
16 //
// As interleaved accesses are difficult to identify in CodeGen (mainly
18 // because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector
19 // IR), we identify and transform them to intrinsics in this pass so the
20 // intrinsics can be easily matched into target specific instructions later in
21 // CodeGen.
22 //
23 // E.g. An interleaved load (Factor = 2):
24 //        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
25 //        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
26 //        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
27 //
28 // It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
29 // intrinsic in ARM backend.
30 //
31 // In X86, this can be further optimized into a set of target
32 // specific loads followed by an optimized sequence of shuffles.
33 //
34 // E.g. An interleaved store (Factor = 3):
35 //        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
36 //                                    <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
37 //        store <12 x i32> %i.vec, <12 x i32>* %ptr
38 //
39 // It could be transformed into a st3 intrinsic in AArch64 backend or a vst3
40 // intrinsic in ARM backend.
41 //
42 // Similarly, a set of interleaved stores can be transformed into an optimized
43 // sequence of shuffles followed by a set of target specific stores for X86.
44 //
45 //===----------------------------------------------------------------------===//
46 
47 #include "llvm/ADT/ArrayRef.h"
48 #include "llvm/ADT/DenseMap.h"
49 #include "llvm/ADT/SmallVector.h"
50 #include "llvm/CodeGen/TargetLowering.h"
51 #include "llvm/CodeGen/TargetPassConfig.h"
52 #include "llvm/CodeGen/TargetSubtargetInfo.h"
53 #include "llvm/IR/Constants.h"
54 #include "llvm/IR/Dominators.h"
55 #include "llvm/IR/Function.h"
56 #include "llvm/IR/IRBuilder.h"
57 #include "llvm/IR/InstIterator.h"
58 #include "llvm/IR/Instruction.h"
59 #include "llvm/IR/Instructions.h"
60 #include "llvm/IR/Type.h"
61 #include "llvm/Pass.h"
62 #include "llvm/Support/Casting.h"
63 #include "llvm/Support/CommandLine.h"
64 #include "llvm/Support/Debug.h"
65 #include "llvm/Support/MathExtras.h"
66 #include "llvm/Support/raw_ostream.h"
67 #include "llvm/Target/TargetMachine.h"
68 #include <cassert>
69 #include <utility>
70 
71 using namespace llvm;
72 
73 #define DEBUG_TYPE "interleaved-access"
74 
// Command-line switch to disable this pass; lowering is enabled by default
// and the flag is hidden from the standard help output.
static cl::opt<bool> LowerInterleavedAccesses(
    "lower-interleaved-accesses",
    cl::desc("Enable lowering interleaved accesses to intrinsics"),
    cl::init(true), cl::Hidden);
79 
80 namespace {
81 
/// Legacy-pass-manager pass that recognizes interleaved-access shuffle
/// patterns around vector loads/stores and asks the target (via
/// TargetLowering) to replace them with target-specific intrinsics.
class InterleavedAccess : public FunctionPass {
public:
  static char ID;

  InterleavedAccess() : FunctionPass(ID) {
    initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "Interleaved Access Pass"; }

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Dominance information is needed by tryReplaceExtracts; the rewrites
    // performed here do not invalidate the dominator tree.
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

private:
  // Both set up in runOnFunction before any lowering is attempted.
  DominatorTree *DT = nullptr;
  const TargetLowering *TLI = nullptr;

  /// The maximum supported interleave factor.
  unsigned MaxFactor;

  /// Transform an interleaved load into target specific intrinsics.
  bool lowerInterleavedLoad(LoadInst *LI,
                            SmallVector<Instruction *, 32> &DeadInsts);

  /// Transform an interleaved store into target specific intrinsics.
  bool lowerInterleavedStore(StoreInst *SI,
                             SmallVector<Instruction *, 32> &DeadInsts);

  /// Returns true if the uses of an interleaved load by the
  /// extractelement instructions in \p Extracts can be replaced by uses of the
  /// shufflevector instructions in \p Shuffles instead. If so, the necessary
  /// replacements are also performed.
  bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
                          ArrayRef<ShuffleVectorInst *> Shuffles);
};
121 
122 } // end anonymous namespace.
123 
char InterleavedAccess::ID = 0;

// Register the pass with the legacy pass manager, declaring the dominator
// tree as a dependency so it is computed before this pass runs.
INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE,
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE,
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)
133 
/// Factory used by target pass pipelines to instantiate this pass.
FunctionPass *llvm::createInterleavedAccessPass() {
  return new InterleavedAccess();
}
137 
138 /// Check if the mask is a DE-interleave mask of the given factor
139 /// \p Factor like:
140 ///     <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
141 static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
142                                        unsigned &Index) {
143   // Check all potential start indices from 0 to (Factor - 1).
144   for (Index = 0; Index < Factor; Index++) {
145     unsigned i = 0;
146 
147     // Check that elements are in ascending order by Factor. Ignore undef
148     // elements.
149     for (; i < Mask.size(); i++)
150       if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
151         break;
152 
153     if (i == Mask.size())
154       return true;
155   }
156 
157   return false;
158 }
159 
160 /// Check if the mask is a DE-interleave mask for an interleaved load.
161 ///
162 /// E.g. DE-interleave masks (Factor = 2) could be:
163 ///     <0, 2, 4, 6>    (mask of index 0 to extract even elements)
164 ///     <1, 3, 5, 7>    (mask of index 1 to extract odd elements)
165 static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
166                                unsigned &Index, unsigned MaxFactor) {
167   if (Mask.size() < 2)
168     return false;
169 
170   // Check potential Factors.
171   for (Factor = 2; Factor <= MaxFactor; Factor++)
172     if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
173       return true;
174 
175   return false;
176 }
177 
178 /// Check if the mask can be used in an interleaved store.
179 //
180 /// It checks for a more general pattern than the RE-interleave mask.
181 /// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
182 /// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
183 /// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
184 /// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
185 ///
186 /// The particular case of an RE-interleave mask is:
187 /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
188 /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
/// On success \p Factor is set to the matched interleave factor. Undef
/// (negative) mask elements are tolerated as long as every defined element
/// stays consistent with the pattern; \p OpNumElts is the element count of
/// each shuffle input, used to reject out-of-range lanes implied by undefs.
static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
                               unsigned MaxFactor, unsigned OpNumElts) {
  unsigned NumElts = Mask.size();
  // The smallest interesting pattern is factor 2 with 2-element lanes.
  if (NumElts < 4)
    return false;

  // Check potential Factors.
  for (Factor = 2; Factor <= MaxFactor; Factor++) {
    // The mask must split evenly into Factor lanes.
    if (NumElts % Factor)
      continue;

    unsigned LaneLen = NumElts / Factor;
    if (!isPowerOf2_32(LaneLen))
      continue;

    // Check whether each element matches the general interleaved rule.
    // Ignore undef elements, as long as the defined elements match the rule.
    // Outer loop processes all factors (x, y, z in the above example)
    unsigned I = 0, J;
    for (; I < Factor; I++) {
      // Last defined value seen immediately before a run of undefs, and the
      // length of that run (0 while no such run is being tracked).
      unsigned SavedLaneValue;
      unsigned SavedNoUndefs = 0;

      // Inner loop processes consecutive accesses (x, x+1... in the example)
      for (J = 0; J < LaneLen - 1; J++) {
        // Lane computes x's position in the Mask
        unsigned Lane = J * Factor + I;
        unsigned NextLane = Lane + Factor;
        int LaneValue = Mask[Lane];
        int NextLaneValue = Mask[NextLane];

        // If both are defined, values must be sequential
        if (LaneValue >= 0 && NextLaneValue >= 0 &&
            LaneValue + 1 != NextLaneValue)
          break;

        // If the next value is undef, save the current one as reference
        if (LaneValue >= 0 && NextLaneValue < 0) {
          SavedLaneValue = LaneValue;
          SavedNoUndefs = 1;
        }

        // Undefs are allowed, but defined elements must still be consecutive:
        // i.e.: x,..., undef,..., x + 2,..., undef,..., undef,..., x + 5, ....
        // Verify this by storing the last non-undef followed by an undef
        // Check that following non-undef masks are incremented with the
        // corresponding distance.
        if (SavedNoUndefs > 0 && LaneValue < 0) {
          SavedNoUndefs++;
          if (NextLaneValue >= 0 &&
              SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue)
            break;
        }
      }

      // The inner loop broke out early: this factor does not match.
      if (J < LaneLen - 1)
        break;

      // Recover the first mask value of this lane (J == LaneLen - 1 here,
      // since the inner loop completed without breaking).
      int StartMask = 0;
      if (Mask[I] >= 0) {
        // Check that the start of the I range (J=0) is greater than 0
        StartMask = Mask[I];
      } else if (Mask[(LaneLen - 1) * Factor + I] >= 0) {
        // StartMask defined by the last value in lane
        StartMask = Mask[(LaneLen - 1) * Factor + I] - J;
      } else if (SavedNoUndefs > 0) {
        // StartMask defined by some non-zero value in the j loop
        StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs);
      }
      // else StartMask remains set to 0, i.e. all elements are undefs

      // A negative start would index before the concatenated inputs.
      if (StartMask < 0)
        break;
      // We must stay within the vectors; This case can happen with undefs.
      if (StartMask + LaneLen > OpNumElts*2)
        break;
    }

    // Found an interleaved mask of current factor.
    if (I == Factor)
      return true;
  }

  return false;
}
274 
275 bool InterleavedAccess::lowerInterleavedLoad(
276     LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts) {
277   if (!LI->isSimple())
278     return false;
279 
280   SmallVector<ShuffleVectorInst *, 4> Shuffles;
281   SmallVector<ExtractElementInst *, 4> Extracts;
282 
283   // Check if all users of this load are shufflevectors. If we encounter any
284   // users that are extractelement instructions, we save them to later check if
285   // they can be modifed to extract from one of the shufflevectors instead of
286   // the load.
287   for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
288     auto *Extract = dyn_cast<ExtractElementInst>(*UI);
289     if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
290       Extracts.push_back(Extract);
291       continue;
292     }
293     ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
294     if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
295       return false;
296 
297     Shuffles.push_back(SVI);
298   }
299 
300   if (Shuffles.empty())
301     return false;
302 
303   unsigned Factor, Index;
304 
305   // Check if the first shufflevector is DE-interleave shuffle.
306   if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index,
307                           MaxFactor))
308     return false;
309 
310   // Holds the corresponding index for each DE-interleave shuffle.
311   SmallVector<unsigned, 4> Indices;
312   Indices.push_back(Index);
313 
314   Type *VecTy = Shuffles[0]->getType();
315 
316   // Check if other shufflevectors are also DE-interleaved of the same type
317   // and factor as the first shufflevector.
318   for (unsigned i = 1; i < Shuffles.size(); i++) {
319     if (Shuffles[i]->getType() != VecTy)
320       return false;
321 
322     if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
323                                     Index))
324       return false;
325 
326     Indices.push_back(Index);
327   }
328 
329   // Try and modify users of the load that are extractelement instructions to
330   // use the shufflevector instructions instead of the load.
331   if (!tryReplaceExtracts(Extracts, Shuffles))
332     return false;
333 
334   LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
335 
336   // Try to create target specific intrinsics to replace the load and shuffles.
337   if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor))
338     return false;
339 
340   for (auto SVI : Shuffles)
341     DeadInsts.push_back(SVI);
342 
343   DeadInsts.push_back(LI);
344   return true;
345 }
346 
347 bool InterleavedAccess::tryReplaceExtracts(
348     ArrayRef<ExtractElementInst *> Extracts,
349     ArrayRef<ShuffleVectorInst *> Shuffles) {
350   // If there aren't any extractelement instructions to modify, there's nothing
351   // to do.
352   if (Extracts.empty())
353     return true;
354 
355   // Maps extractelement instructions to vector-index pairs. The extractlement
356   // instructions will be modified to use the new vector and index operands.
357   DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;
358 
359   for (auto *Extract : Extracts) {
360     // The vector index that is extracted.
361     auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
362     auto Index = IndexOperand->getSExtValue();
363 
364     // Look for a suitable shufflevector instruction. The goal is to modify the
365     // extractelement instruction (which uses an interleaved load) to use one
366     // of the shufflevector instructions instead of the load.
367     for (auto *Shuffle : Shuffles) {
368       // If the shufflevector instruction doesn't dominate the extract, we
369       // can't create a use of it.
370       if (!DT->dominates(Shuffle, Extract))
371         continue;
372 
373       // Inspect the indices of the shufflevector instruction. If the shuffle
374       // selects the same index that is extracted, we can modify the
375       // extractelement instruction.
376       SmallVector<int, 4> Indices;
377       Shuffle->getShuffleMask(Indices);
378       for (unsigned I = 0; I < Indices.size(); ++I)
379         if (Indices[I] == Index) {
380           assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
381                  "Vector operations do not match");
382           ReplacementMap[Extract] = std::make_pair(Shuffle, I);
383           break;
384         }
385 
386       // If we found a suitable shufflevector instruction, stop looking.
387       if (ReplacementMap.count(Extract))
388         break;
389     }
390 
391     // If we did not find a suitable shufflevector instruction, the
392     // extractelement instruction cannot be modified, so we must give up.
393     if (!ReplacementMap.count(Extract))
394       return false;
395   }
396 
397   // Finally, perform the replacements.
398   IRBuilder<> Builder(Extracts[0]->getContext());
399   for (auto &Replacement : ReplacementMap) {
400     auto *Extract = Replacement.first;
401     auto *Vector = Replacement.second.first;
402     auto Index = Replacement.second.second;
403     Builder.SetInsertPoint(Extract);
404     Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
405     Extract->eraseFromParent();
406   }
407 
408   return true;
409 }
410 
411 bool InterleavedAccess::lowerInterleavedStore(
412     StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
413   if (!SI->isSimple())
414     return false;
415 
416   ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
417   if (!SVI || !SVI->hasOneUse())
418     return false;
419 
420   // Check if the shufflevector is RE-interleave shuffle.
421   unsigned Factor;
422   unsigned OpNumElts = SVI->getOperand(0)->getType()->getVectorNumElements();
423   if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor, OpNumElts))
424     return false;
425 
426   LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
427 
428   // Try to create target specific intrinsics to replace the store and shuffle.
429   if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
430     return false;
431 
432   // Already have a new target specific interleaved store. Erase the old store.
433   DeadInsts.push_back(SI);
434   DeadInsts.push_back(SVI);
435   return true;
436 }
437 
438 bool InterleavedAccess::runOnFunction(Function &F) {
439   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
440   if (!TPC || !LowerInterleavedAccesses)
441     return false;
442 
443   LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
444 
445   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
446   auto &TM = TPC->getTM<TargetMachine>();
447   TLI = TM.getSubtargetImpl(F)->getTargetLowering();
448   MaxFactor = TLI->getMaxSupportedInterleaveFactor();
449 
450   // Holds dead instructions that will be erased later.
451   SmallVector<Instruction *, 32> DeadInsts;
452   bool Changed = false;
453 
454   for (auto &I : instructions(F)) {
455     if (LoadInst *LI = dyn_cast<LoadInst>(&I))
456       Changed |= lowerInterleavedLoad(LI, DeadInsts);
457 
458     if (StoreInst *SI = dyn_cast<StoreInst>(&I))
459       Changed |= lowerInterleavedStore(SI, DeadInsts);
460   }
461 
462   for (auto I : DeadInsts)
463     I->eraseFromParent();
464 
465   return Changed;
466 }
467