1 //===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
/// purpose of this pass is to do some IR pattern matching to create ACLE
/// DSP intrinsics, which map onto these 32-bit SIMD operations.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/Analysis/AliasAnalysis.h"
19 #include "llvm/Analysis/LoopAccessAnalysis.h"
20 #include "llvm/Analysis/LoopPass.h"
21 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/NoFolder.h"
24 #include "llvm/Transforms/Scalar.h"
25 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
26 #include "llvm/Transforms/Utils/LoopUtils.h"
27 #include "llvm/Pass.h"
28 #include "llvm/PassRegistry.h"
29 #include "llvm/PassSupport.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/IR/PatternMatch.h"
32 #include "llvm/CodeGen/TargetPassConfig.h"
33 #include "ARM.h"
34 #include "ARMSubtarget.h"
35 
36 using namespace llvm;
37 using namespace PatternMatch;
38 
39 #define DEBUG_TYPE "parallel-dsp"
40 
namespace {
  // Forward declarations for the bookkeeping structs defined below.
  struct ParallelMAC;
  struct Reduction;

  using ParallelMACList = SmallVector<ParallelMAC, 8>;
  using ReductionList   = SmallVector<Reduction, 8>;
  using ValueList       = SmallVector<Value*, 8>;
  using LoadInstList    = SmallVector<LoadInst*, 8>;
  using PMACPair        = std::pair<ParallelMAC*,ParallelMAC*>;
  using PMACPairList    = SmallVector<PMACPair, 8>;
  using Instructions    = SmallVector<Instruction*,16>;
  using MemLocList      = SmallVector<MemoryLocation, 4>;

  // 'ParallelMAC' and 'Reduction' are just some bookkeeping data structures.
  // 'Reduction' contains the phi-node and accumulator statement from where we
  // start pattern matching, and 'ParallelMAC' the multiplication
  // instructions that are candidates for parallel execution.
  struct ParallelMAC {
    Instruction *Mul;       // The candidate multiply instruction itself.
    ValueList    VL;        // List of all (narrow) operands of this Mul
    LoadInstList VecLd;     // List of all load instructions of this Mul
    MemLocList   MemLocs;   // All memory locations read by this Mul

    ParallelMAC(Instruction *I, ValueList &V) : Mul(I), VL(V) {};
  };

  struct Reduction {
    PHINode         *Phi;             // The Phi-node from where we start
                                      // pattern matching.
    Instruction     *AccIntAdd;       // The accumulating integer add statement,
                                      // i.e, the reduction statement.

    Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
  };

  // Loop pass that looks for integer add reductions over sign/zero-extended
  // 16-bit loads and rewrites the multiply-accumulate chains it finds into
  // calls to the arm.smlad intrinsic.
  class ARMParallelDSP : public LoopPass {
    ScalarEvolution   *SE;
    AliasAnalysis     *AA;
    TargetLibraryInfo *TLI;
    DominatorTree     *DT;
    LoopInfo          *LI;
    Loop              *L;              // The loop currently being transformed.
    const DataLayout  *DL;
    Module            *M;

    // Rewrites each pair into one SMLAD call; returns true if IR changed.
    bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
    // Returns true (recording Ld0/Ld1 in VecLd) if both loads are simple,
    // single-use and consecutive in memory.
    bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, LoadInstList &VecLd);
    // Pairs up adjacent candidates with symmetrical operands and
    // consecutive feeding loads.
    PMACPairList CreateParallelMACPairs(ParallelMACList &Candidates);
    // Emits the widened loads and the smlad call after InsertAfter.
    Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
                                 Instruction *Acc, Instruction *InsertAfter);

    /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
    /// Dual performs two signed 16x16-bit multiplications. It adds the
    /// products to a 32-bit accumulate operand. Optionally, the instruction can
    /// exchange the halfwords of the second operand before performing the
    /// arithmetic.
    bool MatchSMLAD(Function &F);

  public:
    static char ID;

    ARMParallelDSP() : LoopPass(ID) { }

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      LoopPass::getAnalysisUsage(AU);
      AU.addRequired<AssumptionCacheTracker>();
      AU.addRequired<ScalarEvolutionWrapperPass>();
      AU.addRequired<AAResultsWrapperPass>();
      AU.addRequired<TargetLibraryInfoWrapperPass>();
      AU.addRequired<LoopInfoWrapperPass>();
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addRequired<TargetPassConfig>();
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.setPreservesCFG();
    }

    bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
      L = TheLoop;
      // Cache the analyses used by the matching/rewriting helpers.
      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
      auto &TPC = getAnalysis<TargetPassConfig>();

      BasicBlock *Header = TheLoop->getHeader();
      if (!Header)
        return false;

      // TODO: We assume the loop header and latch to be the same block.
      // This is not a fundamental restriction, but lifting this would just
      // require more work to do the transformation and then patch up the CFG.
      if (Header != TheLoop->getLoopLatch()) {
        LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
                             "running pass ARMParallelDSP\n");
        return false;
      }

      Function &F = *Header->getParent();
      M = F.getParent();
      DL = &M->getDataLayout();

      auto &TM = TPC.getTM<TargetMachine>();
      auto *ST = &TM.getSubtarget<ARMSubtarget>(F);

      // The widened i32 loads this pass creates may be unaligned, so bail
      // out on cores that do not allow unaligned accesses.
      if (!ST->allowsUnalignedMem()) {
        LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
                             "running pass ARMParallelDSP\n");
        return false;
      }

      if (!ST->hasDSP()) {
        LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
                             "ARMParallelDSP\n");
        return false;
      }

      // NOTE(review): LAI is constructed here but never used below —
      // confirm whether it can be removed.
      LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
      bool Changes = false;

      LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n\n");
      Changes = MatchSMLAD(F);
      return Changes;
    }
  };
}
167 
// Returns true if V roots a chain of operations whose values stay within
// BitWidth bits (e.g. a sext/zext from a BitWidth-sized load). The narrow
// values discovered along the way (loads, casts, non-instruction operands)
// are appended to VL in discovery order; downstream code indexes into this
// list positionally, so the push order here is load-bearing.
template<unsigned BitWidth>
static bool IsNarrowSequence(Value *V, ValueList &VL) {
  LLVM_DEBUG(dbgs() << "Is narrow sequence: "; V->dump());
  ConstantInt *CInt;

  if (match(V, m_ConstantInt(CInt))) {
    // TODO: if a constant is used, it needs to fit within the bit width.
    return false;
  }

  auto *I = dyn_cast<Instruction>(V);
  if (!I)
   return false;

  Value *Val, *LHS, *RHS;
  bool isNarrow = false;

  if (match(V, m_Trunc(m_Value(Val)))) {
    // A trunc down to exactly BitWidth bits is narrow iff its input is too.
    if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == BitWidth)
      isNarrow = IsNarrowSequence<BitWidth>(Val, VL);
  } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
    // TODO: we need to implement sadd16/sadd8 for this, which enables to
    // also do the rewrite for smlad8.ll, but it is unsupported for now.
    isNarrow = false;
  } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
    if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() == BitWidth)
      isNarrow = true;
    else
      LLVM_DEBUG(dbgs() << "Wrong SrcTy size of CastInst: " <<
                 cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth());

    // NOTE(review): the branch below sets isNarrow even when the SrcTy
    // width check above failed, so a cast from a differently-sized load is
    // still accepted — confirm this is intended.
    if (match(Val, m_Load(m_Value(Val)))) {
      // The m_Load match above guarantees operand 0 is a LoadInst.
      auto *Ld = dyn_cast<LoadInst>(I->getOperand(0));
      LLVM_DEBUG(dbgs() << "Found narrow Load:\t"; Ld->dump());
      VL.push_back(Ld);
      isNarrow = true;
    } else if (!isa<Instruction>(I->getOperand(0)))
      // Non-instruction operands (e.g. arguments) are recorded, but do not
      // make the sequence narrow by themselves.
      VL.push_back(I->getOperand(0));
  }

  if (isNarrow) {
    LLVM_DEBUG(dbgs() << "Found narrow Op:\t"; I->dump());
    VL.push_back(I);
  } else
    LLVM_DEBUG(dbgs() << "Found unsupported Op:\t"; I->dump());

  return isNarrow;
}
216 
217 // Element-by-element comparison of Value lists returning true if they are
218 // instructions with the same opcode or constants with the same value.
219 static bool AreSymmetrical(const ValueList &VL0,
220                            const ValueList &VL1) {
221   if (VL0.size() != VL1.size()) {
222     LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
223                       << VL0.size() << " != " << VL1.size() << "\n");
224     return false;
225   }
226 
227   const unsigned Pairs = VL0.size();
228   LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
229 
230   for (unsigned i = 0; i < Pairs; ++i) {
231     const Value *V0 = VL0[i];
232     const Value *V1 = VL1[i];
233     const auto *Inst0 = dyn_cast<Instruction>(V0);
234     const auto *Inst1 = dyn_cast<Instruction>(V1);
235 
236     LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
237                dbgs() << "mul1: "; V0->dump();
238                dbgs() << "mul2: "; V1->dump());
239 
240     if (!Inst0 || !Inst1)
241       return false;
242 
243     if (Inst0->isSameOperationAs(Inst1)) {
244       LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
245       continue;
246     }
247 
248     const APInt *C0, *C1;
249     if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
250       return false;
251   }
252 
253   LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
254   return true;
255 }
256 
257 bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
258                                         LoadInstList &VecLd) {
259   if (!Ld0 || !Ld1)
260     return false;
261 
262   LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
263     dbgs() << "Ld0:"; Ld0->dump();
264     dbgs() << "Ld1:"; Ld1->dump();
265   );
266 
267   if (!Ld0->isSimple() || !Ld1->isSimple()) {
268     LLVM_DEBUG(dbgs() << "No, not touching volatile loads\n");
269     return false;
270   }
271   if (!Ld0->hasOneUse() || !Ld1->hasOneUse()) {
272     LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
273     return false;
274   }
275   if (isConsecutiveAccess(Ld0, Ld1, *DL, *SE)) {
276     VecLd.push_back(Ld0);
277     VecLd.push_back(Ld1);
278     LLVM_DEBUG(dbgs() << "OK: loads are consecutive.\n");
279     return true;
280   }
281   LLVM_DEBUG(dbgs() << "No, Ld0 and Ld1 aren't consecutive.\n");
282   return false;
283 }
284 
// Pairs up adjacent candidates (i, i+1) whose mul operand lists are
// symmetrical and whose feeding loads are pairwise consecutive in memory;
// each returned pair can be rewritten into one SMLAD call.
PMACPairList
ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
  const unsigned Elems = Candidates.size();
  PMACPairList PMACPairs;

  if (Elems < 2)
    return PMACPairs;

  // TODO: for now we simply try to match consecutive pairs i and i+1.
  // We can compare all elements, but then we need to compare and evaluate
  // different solutions.
  for(unsigned i=0; i<Elems-1; i+=2) {
    ParallelMAC &PMul0 = Candidates[i];
    ParallelMAC &PMul1 = Candidates[i+1];
    const Instruction *Mul0 = PMul0.Mul;
    const Instruction *Mul1 = PMul1.Mul;

    if (Mul0 == Mul1)
      continue;

    LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
               dbgs() << "- "; Mul0->dump();
               dbgs() << "- "; Mul1->dump());

    const ValueList &VL0 = PMul0.VL;
    const ValueList &VL1 = PMul1.VL;

    if (!AreSymmetrical(VL0, VL1))
      continue;

    LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
    // The first elements of each vector should be loads with sexts. If we find
    // that its two pairs of consecutive loads, then these can be transformed
    // into two wider loads and the users can be replaced with DSP
    // intrinsics.
    // NOTE(review): indexing x and x+2 assumes VL.size() is a multiple of 4
    // (the layout IsNarrowSequence produces); x+2 would read out of bounds
    // otherwise — confirm. Also note the same pair may be pushed once per
    // matching load group — confirm this is intended.
    for (unsigned x = 0; x < VL0.size(); x += 4) {
      auto *Ld0 = dyn_cast<LoadInst>(VL0[x]);
      auto *Ld1 = dyn_cast<LoadInst>(VL1[x]);
      auto *Ld2 = dyn_cast<LoadInst>(VL0[x+2]);
      auto *Ld3 = dyn_cast<LoadInst>(VL1[x+2]);

      LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
                 dbgs() << "\t mul1: "; VL0[x]->dump();
                 dbgs() << "\t mul2: "; VL1[x]->dump();
                 dbgs() << "and operands " << x + 2 << ":\n";
                 dbgs() << "\t mul1: "; VL0[x+2]->dump();
                 dbgs() << "\t mul2: "; VL1[x+2]->dump());

      if (AreSequentialLoads(Ld0, Ld1, Candidates[i].VecLd) &&
          AreSequentialLoads(Ld2, Ld3, Candidates[i+1].VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        PMACPairs.push_back(std::make_pair(&PMul0, &PMul1));
      }
    }
  }
  return PMACPairs;
}
342 
343 bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
344                                         PMACPairList &PMACPairs) {
345   Instruction *Acc = Reduction.Phi;
346   Instruction *InsertAfter = Reduction.AccIntAdd;
347 
348   for (auto &Pair : PMACPairs) {
349     LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
350                dbgs() << "- "; Pair.first->Mul->dump();
351                dbgs() << "- "; Pair.second->Mul->dump());
352     Acc = CreateSMLADCall(Pair.first->VecLd[0], Pair.second->VecLd[0], Acc,
353 	                        InsertAfter);
354     InsertAfter = Acc;
355   }
356 
357   if (Acc != Reduction.Phi) {
358     LLVM_DEBUG(dbgs() << "Replace Accumulate: "; Acc->dump());
359     Reduction.AccIntAdd->replaceAllUsesWith(Acc);
360     return true;
361   }
362   return false;
363 }
364 
365 static ReductionList MatchReductions(Function &F, Loop *TheLoop,
366                                      BasicBlock *Header) {
367   ReductionList Reductions;
368   RecurrenceDescriptor RecDesc;
369   const bool HasFnNoNaNAttr =
370     F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
371   const BasicBlock *Latch = TheLoop->getLoopLatch();
372 
373   // We need a preheader as getIncomingValueForBlock assumes there is one.
374   if (!TheLoop->getLoopPreheader())
375     return Reductions;
376 
377   for (PHINode &Phi : Header->phis()) {
378     const auto *Ty = Phi.getType();
379     if (!Ty->isIntegerTy(32))
380       continue;
381 
382     const bool IsReduction =
383       RecurrenceDescriptor::AddReductionVar(&Phi,
384                                             RecurrenceDescriptor::RK_IntegerAdd,
385                                             TheLoop, HasFnNoNaNAttr, RecDesc);
386     if (!IsReduction)
387       continue;
388 
389     Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
390     if (!Acc)
391       continue;
392 
393     Reductions.push_back(Reduction(&Phi, Acc));
394   }
395 
396   LLVM_DEBUG(
397     dbgs() << "\nAccumulating integer additions (reductions) found:\n";
398     for (auto R : Reductions) {
399       dbgs() << "-  "; R.Phi->dump();
400       dbgs() << "-> "; R.AccIntAdd->dump();
401     }
402   );
403   return Reductions;
404 }
405 
406 static void AddCandidateMAC(ParallelMACList &Candidates, const Instruction *Acc,
407                             Value *MulOp0, Value *MulOp1, int MulOpNum) {
408   Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
409   LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
410   ValueList VL;
411   if (IsNarrowSequence<16>(MulOp0, VL) &&
412       IsNarrowSequence<16>(MulOp1, VL)) {
413     LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
414     Candidates.push_back(ParallelMAC(Mul, VL));
415   }
416 }
417 
// Walks the add-chain upwards from the reduction statement R.AccIntAdd,
// collecting every mul that is accumulated along the way. Two shapes are
// handled: add(mul, acc) and add(acc, mul).
static ParallelMACList MatchParallelMACs(Reduction &R) {
  ParallelMACList Candidates;
  const Instruction *Acc = R.AccIntAdd;
  Value *A, *MulOp0, *MulOp1;
  LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());

  // Pattern 1: the accumulator is the RHS of the mul.
  // NOTE(review): if A is not an Instruction, dyn_cast leaves Acc null
  // before the next match call — confirm match() tolerates a null Value.
  while(match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)),
                         m_Value(A)))){
    AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 0);
    Acc = dyn_cast<Instruction>(A);
  }
  // Pattern 2: the accumulator is the LHS of the mul.
  while(match(Acc, m_Add(m_Value(A),
                         m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
    AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 1);
    Acc = dyn_cast<Instruction>(A);
  }

  // The last mul in the chain has a slightly different pattern:
  // the mul is the first operand. This only fires when the pattern-2 loop
  // above advanced Acc to a node that matches pattern 1 again.
  if (match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A))))
    AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 0);

  // Because we start at the bottom of the chain, and we work our way up,
  // the muls are added in reverse program order to the list.
  std::reverse(Candidates.begin(), Candidates.end());
  return Candidates;
}
447 
448 // Collects all instructions that are not part of the MAC chains, which is the
449 // set of instructions that can potentially alias with the MAC operands.
450 static Instructions AliasCandidates(BasicBlock *Header,
451                                     ParallelMACList &MACCandidates) {
452   Instructions Aliases;
453   auto IsMACCandidate = [] (Instruction *I, ParallelMACList &MACCandidates) {
454     for (auto &MAC : MACCandidates)
455       for (auto *Val : MAC.VL)
456         if (I == MAC.Mul || Val == I)
457           return true;
458    return false;
459   };
460 
461   std::for_each(Header->begin(), Header->end(),
462                 [&Aliases, &MACCandidates, &IsMACCandidate] (Instruction &I) {
463                   if (I.mayReadOrWriteMemory() &&
464                       !IsMACCandidate(&I, MACCandidates))
465                     Aliases.push_back(&I); });
466   return Aliases;
467 }
468 
469 // This compares all instructions from the "alias candidates" set, i.e., all
470 // instructions that are not part of the MAC-chain, with all instructions in
471 // the MAC candidate set, to see if instructions are aliased.
472 static bool AreAliased(AliasAnalysis *AA, Instructions AliasCandidates,
473                        ParallelMACList &MACCandidates) {
474   LLVM_DEBUG(dbgs() << "Alias checks:\n");
475   for (auto *I : AliasCandidates) {
476     LLVM_DEBUG(dbgs() << "- "; I->dump());
477     for (auto &MAC : MACCandidates) {
478       LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump());
479       assert(MAC.MemLocs.size() >= 2 && "expecting at least 2 memlocs");
480       for (auto &MemLoc : MAC.MemLocs) {
481         if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
482                                           ModRefInfo::ModRef))) {
483           LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
484           return true;
485         }
486       }
487     }
488   }
489   LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
490   return false;
491 }
492 
493 static bool SetMemoryLocations(ParallelMACList &Candidates) {
494   const auto Size = MemoryLocation::UnknownSize;
495   for (auto &C : Candidates) {
496     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
497     // we expect at least 4 items in this operand value list.
498     if (C.VL.size() < 4) {
499       LLVM_DEBUG(dbgs() << "Operand list too short.\n");
500       return false;
501     }
502 
503     for (unsigned i = 0; i < C.VL.size(); i += 4) {
504       auto *LdOp0 = dyn_cast<LoadInst>(C.VL[i]);
505       auto *LdOp1 = dyn_cast<LoadInst>(C.VL[i+2]);
506       if (!LdOp0 || !LdOp1)
507         return false;
508 
509       C.MemLocs.push_back(MemoryLocation(LdOp0->getPointerOperand(), Size));
510       C.MemLocs.push_back(MemoryLocation(LdOp1->getPointerOperand(), Size));
511     }
512   }
513   return true;
514 }
515 
516 // Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
517 // multiplications.
518 // To use SMLAD:
519 // 1) we first need to find integer add reduction PHIs,
520 // 2) then from the PHI, look for this pattern:
521 //
522 // acc0 = phi i32 [0, %entry], [%acc1, %loop.body]
523 // ld0 = load i16
524 // sext0 = sext i16 %ld0 to i32
525 // ld1 = load i16
526 // sext1 = sext i16 %ld1 to i32
527 // mul0 = mul %sext0, %sext1
528 // ld2 = load i16
529 // sext2 = sext i16 %ld2 to i32
530 // ld3 = load i16
531 // sext3 = sext i16 %ld3 to i32
532 // mul1 = mul i32 %sext2, %sext3
533 // add0 = add i32 %mul0, %acc0
534 // acc1 = add i32 %add0, %mul1
535 //
536 // Which can be selected to:
537 //
538 // ldr.h r0
539 // ldr.h r1
540 // smlad r2, r0, r1, r2
541 //
542 // If constants are used instead of loads, these will need to be hoisted
543 // out and into a register.
544 //
545 // If loop invariants are used instead of loads, these need to be packed
546 // before the loop begins.
547 //
548 // Can only be enabled for cores which support unaligned accesses.
549 //
550 bool ARMParallelDSP::MatchSMLAD(Function &F) {
551   BasicBlock *Header = L->getHeader();
552   LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
553              dbgs() << "Header block:\n"; Header->dump();
554              dbgs() << "Loop info:\n\n"; L->dump());
555 
556   bool Changed = false;
557   ReductionList Reductions = MatchReductions(F, L, Header);
558 
559   for (auto &R : Reductions) {
560     ParallelMACList MACCandidates = MatchParallelMACs(R);
561     if (!SetMemoryLocations(MACCandidates))
562       continue;
563     Instructions Aliases = AliasCandidates(Header, MACCandidates);
564     if (AreAliased(AA, Aliases, MACCandidates))
565       continue;
566     PMACPairList PMACPairs = CreateParallelMACPairs(MACCandidates);
567     Changed = InsertParallelMACs(R, PMACPairs) || Changed;
568   }
569 
570   LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
571   return Changed;
572 }
573 
574 static void CreateLoadIns(IRBuilder<NoFolder> &IRB, Instruction *Acc,
575                           LoadInst **VecLd) {
576   const Type *AccTy = Acc->getType();
577   const unsigned AddrSpace = (*VecLd)->getPointerAddressSpace();
578 
579   Value *VecPtr = IRB.CreateBitCast((*VecLd)->getPointerOperand(),
580                                     AccTy->getPointerTo(AddrSpace));
581   *VecLd = IRB.CreateAlignedLoad(VecPtr, (*VecLd)->getAlignment());
582 }
583 
584 Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
585                                              Instruction *Acc,
586                                              Instruction *InsertAfter) {
587   LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n";
588              dbgs() << "- "; VecLd0->dump();
589              dbgs() << "- "; VecLd1->dump();
590              dbgs() << "- "; Acc->dump());
591 
592   IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
593                               ++BasicBlock::iterator(InsertAfter));
594 
595   // Replace the reduction chain with an intrinsic call
596   CreateLoadIns(Builder, Acc, &VecLd0);
597   CreateLoadIns(Builder, Acc, &VecLd1);
598   Value* Args[] = { VecLd0, VecLd1, Acc };
599   Function *SMLAD = Intrinsic::getDeclaration(M, Intrinsic::arm_smlad);
600   CallInst *Call = Builder.CreateCall(SMLAD, Args);
601   return Call;
602 }
603 
// Factory hook used by the ARM backend to instantiate this pass.
Pass *llvm::createARMParallelDSPPass() {
  return new ARMParallelDSP();
}
607 
// Pass identification, replacement for typeid.
char ARMParallelDSP::ID = 0;

// Register the pass with the pass registry under the "parallel-dsp" name.
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "parallel-dsp",
                "Transform loops to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "parallel-dsp",
                "Transform loops to use DSP intrinsics", false, false)
614