1 //===- LoopCacheAnalysis.cpp - Loop Cache Analysis -------------------------==//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6 // See https://llvm.org/LICENSE.txt for license information.
7 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 //
9 //===----------------------------------------------------------------------===//
10 ///
11 /// \file
12 /// This file defines the implementation for the loop cache analysis.
13 /// The implementation is largely based on the following paper:
14 ///
15 ///       Compiler Optimizations for Improving Data Locality
16 ///       By: Steve Carr, Katherine S. McKinley, Chau-Wen Tseng
17 ///       http://www.cs.utexas.edu/users/mckinley/papers/asplos-1994.pdf
18 ///
19 /// The general approach taken to estimate the number of cache lines used by the
20 /// memory references in an inner loop is:
21 ///    1. Partition memory references that exhibit temporal or spacial reuse
22 ///       into reference groups.
///    2. For each loop L in a loop nest LN:
24 ///       a. Compute the cost of the reference group
25 ///       b. Compute the loop cost by summing up the reference groups costs
26 //===----------------------------------------------------------------------===//
27 
#include "llvm/Analysis/LoopCacheAnalysis.h"
#include "llvm/ADT/BreadthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
34 
35 using namespace llvm;
36 
37 #define DEBUG_TYPE "loop-cache-cost"
38 
// Trip count assumed for a loop whenever ScalarEvolution cannot determine a
// constant trip count; overridable via -default-trip-count.
static cl::opt<unsigned> DefaultTripCount(
    "default-trip-count", cl::init(100), cl::Hidden,
    cl::desc("Use this to specify the default trip count of a loop"));
42 
// In this analysis two array references are considered to exhibit temporal
// reuse if they access either the same memory location, or a memory location
// with distance smaller than a configurable threshold.
// The threshold is expressed in loop iterations (default: 2), overridable via
// -temporal-reuse-threshold.
static cl::opt<unsigned> TemporalReuseThreshold(
    "temporal-reuse-threshold", cl::init(2), cl::Hidden,
    cl::desc("Use this to specify the max. distance between array elements "
             "accessed in a loop so that the elements are classified to have "
             "temporal reuse"));
51 
52 /// Retrieve the innermost loop in the given loop nest \p Loops. It returns a
53 /// nullptr if any loops in the loop vector supplied has more than one sibling.
54 /// The loop vector is expected to contain loops collected in breadth-first
55 /// order.
56 static Loop *getInnerMostLoop(const LoopVectorTy &Loops) {
57   assert(!Loops.empty() && "Expecting a non-empy loop vector");
58 
59   Loop *LastLoop = Loops.back();
60   Loop *ParentLoop = LastLoop->getParentLoop();
61 
62   if (ParentLoop == nullptr) {
63     assert(Loops.size() == 1 && "Expecting a single loop");
64     return LastLoop;
65   }
66 
67   return (llvm::is_sorted(Loops,
68                           [](const Loop *L1, const Loop *L2) {
69                             return L1->getLoopDepth() < L2->getLoopDepth();
70                           }))
71              ? LastLoop
72              : nullptr;
73 }
74 
75 static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize,
76                                   const Loop &L, ScalarEvolution &SE) {
77   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(&AccessFn);
78   if (!AR || !AR->isAffine())
79     return false;
80 
81   assert(AR->getLoop() && "AR should have a loop");
82 
83   // Check that start and increment are not add recurrences.
84   const SCEV *Start = AR->getStart();
85   const SCEV *Step = AR->getStepRecurrence(SE);
86   if (isa<SCEVAddRecExpr>(Start) || isa<SCEVAddRecExpr>(Step))
87     return false;
88 
89   // Check that start and increment are both invariant in the loop.
90   if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L))
91     return false;
92 
93   const SCEV *StepRec = AR->getStepRecurrence(SE);
94   if (StepRec && SE.isKnownNegative(StepRec))
95     StepRec = SE.getNegativeSCEV(StepRec);
96 
97   return StepRec == &ElemSize;
98 }
99 
100 /// Compute the trip count for the given loop \p L. Return the SCEV expression
101 /// for the trip count or nullptr if it cannot be computed.
102 static const SCEV *computeTripCount(const Loop &L, ScalarEvolution &SE) {
103   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L);
104   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
105       !isa<SCEVConstant>(BackedgeTakenCount))
106     return nullptr;
107 
108   return SE.getAddExpr(BackedgeTakenCount,
109                        SE.getOne(BackedgeTakenCount->getType()));
110 }
111 
112 //===----------------------------------------------------------------------===//
113 // IndexedReference implementation
114 //
115 raw_ostream &llvm::operator<<(raw_ostream &OS, const IndexedReference &R) {
116   if (!R.IsValid) {
117     OS << R.StoreOrLoadInst;
118     OS << ", IsValid=false.";
119     return OS;
120   }
121 
122   OS << *R.BasePointer;
123   for (const SCEV *Subscript : R.Subscripts)
124     OS << "[" << *Subscript << "]";
125 
126   OS << ", Sizes: ";
127   for (const SCEV *Size : R.Sizes)
128     OS << "[" << *Size << "]";
129 
130   return OS;
131 }
132 
133 IndexedReference::IndexedReference(Instruction &StoreOrLoadInst,
134                                    const LoopInfo &LI, ScalarEvolution &SE)
135     : StoreOrLoadInst(StoreOrLoadInst), SE(SE) {
136   assert((isa<StoreInst>(StoreOrLoadInst) || isa<LoadInst>(StoreOrLoadInst)) &&
137          "Expecting a load or store instruction");
138 
139   IsValid = delinearize(LI);
140   if (IsValid)
141     LLVM_DEBUG(dbgs().indent(2) << "Succesfully delinearized: " << *this
142                                 << "\n");
143 }
144 
145 Optional<bool> IndexedReference::hasSpacialReuse(const IndexedReference &Other,
146                                                  unsigned CLS,
147                                                  AliasAnalysis &AA) const {
148   assert(IsValid && "Expecting a valid reference");
149 
150   if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) {
151     LLVM_DEBUG(dbgs().indent(2)
152                << "No spacial reuse: different base pointers\n");
153     return false;
154   }
155 
156   unsigned NumSubscripts = getNumSubscripts();
157   if (NumSubscripts != Other.getNumSubscripts()) {
158     LLVM_DEBUG(dbgs().indent(2)
159                << "No spacial reuse: different number of subscripts\n");
160     return false;
161   }
162 
163   // all subscripts must be equal, except the leftmost one (the last one).
164   for (auto SubNum : seq<unsigned>(0, NumSubscripts - 1)) {
165     if (getSubscript(SubNum) != Other.getSubscript(SubNum)) {
166       LLVM_DEBUG(dbgs().indent(2) << "No spacial reuse, different subscripts: "
167                                   << "\n\t" << *getSubscript(SubNum) << "\n\t"
168                                   << *Other.getSubscript(SubNum) << "\n");
169       return false;
170     }
171   }
172 
173   // the difference between the last subscripts must be less than the cache line
174   // size.
175   const SCEV *LastSubscript = getLastSubscript();
176   const SCEV *OtherLastSubscript = Other.getLastSubscript();
177   const SCEVConstant *Diff = dyn_cast<SCEVConstant>(
178       SE.getMinusSCEV(LastSubscript, OtherLastSubscript));
179 
180   if (Diff == nullptr) {
181     LLVM_DEBUG(dbgs().indent(2)
182                << "No spacial reuse, difference between subscript:\n\t"
183                << *LastSubscript << "\n\t" << OtherLastSubscript
184                << "\nis not constant.\n");
185     return None;
186   }
187 
188   bool InSameCacheLine = (Diff->getValue()->getSExtValue() < CLS);
189 
190   LLVM_DEBUG({
191     if (InSameCacheLine)
192       dbgs().indent(2) << "Found spacial reuse.\n";
193     else
194       dbgs().indent(2) << "No spacial reuse.\n";
195   });
196 
197   return InSameCacheLine;
198 }
199 
// Return true when \p Other exhibits temporal reuse with this reference in
// loop \p L: the two accesses depend on each other, and the dependence
// distance is zero at every loop level except possibly at L's depth, where it
// must be at most MaxDistance. Returns None when a distance is unknown.
Optional<bool> IndexedReference::hasTemporalReuse(const IndexedReference &Other,
                                                  unsigned MaxDistance,
                                                  const Loop &L,
                                                  DependenceInfo &DI,
                                                  AliasAnalysis &AA) const {
  assert(IsValid && "Expecting a valid reference");

  // References to provably distinct objects cannot reuse each other's data.
  if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) {
    LLVM_DEBUG(dbgs().indent(2)
               << "No temporal reuse: different base pointer\n");
    return false;
  }

  // Ask dependence analysis whether the two accesses are related.
  std::unique_ptr<Dependence> D =
      DI.depends(&StoreOrLoadInst, &Other.StoreOrLoadInst, true);

  if (D == nullptr) {
    LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: no dependence\n");
    return false;
  }

  // A loop-independent dependence means both accesses touch the same
  // location within a single iteration.
  if (D->isLoopIndependent()) {
    LLVM_DEBUG(dbgs().indent(2) << "Found temporal reuse\n");
    return true;
  }

  // Check the dependence distance at every loop level. There is temporal reuse
  // if the distance at the given loop's depth is small (|d| <= MaxDistance) and
  // it is zero at every other loop level.
  int LoopDepth = L.getLoopDepth();
  int Levels = D->getLevels();
  for (int Level = 1; Level <= Levels; ++Level) {
    const SCEV *Distance = D->getDistance(Level);
    const SCEVConstant *SCEVConst = dyn_cast_or_null<SCEVConstant>(Distance);

    // A non-constant (or unavailable) distance is inconclusive.
    if (SCEVConst == nullptr) {
      LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: distance unknown\n");
      return None;
    }

    const ConstantInt &CI = *SCEVConst->getValue();
    if (Level != LoopDepth && !CI.isZero()) {
      LLVM_DEBUG(dbgs().indent(2)
                 << "No temporal reuse: distance is not zero at depth=" << Level
                 << "\n");
      return false;
    } else if (Level == LoopDepth && CI.getSExtValue() > MaxDistance) {
      // NOTE(review): this compares the signed distance, not |d| as the
      // comment above states — a negative distance always passes this check;
      // confirm that is intended.
      LLVM_DEBUG(
          dbgs().indent(2)
          << "No temporal reuse: distance is greater than MaxDistance at depth="
          << Level << "\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs().indent(2) << "Found temporal reuse\n");
  return true;
}
258 
// Estimate the number of cache lines this reference touches in one full
// execution of loop \p L: 1 if loop invariant, (TripCount*Stride)/CLS for a
// consecutive access, TripCount otherwise. Returns CacheCost::InvalidCost
// when the result does not fold to a constant.
CacheCostTy IndexedReference::computeRefCost(const Loop &L,
                                             unsigned CLS) const {
  assert(IsValid && "Expecting a valid reference");
  LLVM_DEBUG({
    dbgs().indent(2) << "Computing cache cost for:\n";
    dbgs().indent(4) << *this << "\n";
  });

  // If the indexed reference is loop invariant the cost is one.
  if (isLoopInvariant(L)) {
    LLVM_DEBUG(dbgs().indent(4) << "Reference is loop invariant: RefCost=1\n");
    return 1;
  }

  // Use the loop's constant trip count when available, otherwise fall back to
  // the configurable DefaultTripCount.
  const SCEV *TripCount = computeTripCount(L, SE);
  if (!TripCount) {
    LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName()
                      << " could not be computed, using DefaultTripCount\n");
    const SCEV *ElemSize = Sizes.back();
    TripCount = SE.getConstant(ElemSize->getType(), DefaultTripCount);
  }
  LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n");

  // If the indexed reference is 'consecutive' the cost is
  // (TripCount*Stride)/CLS, otherwise the cost is TripCount.
  const SCEV *RefCost = TripCount;

  if (isConsecutive(L, CLS)) {
    // Stride in bytes is the innermost subscript's coefficient times the
    // element size.
    const SCEV *Coeff = getLastCoefficient();
    const SCEV *ElemSize = Sizes.back();
    const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
    const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS);
    // Widen both operands to a common type before multiplying, and use the
    // absolute stride: a reverse traversal touches the same cache lines.
    Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType());
    if (SE.isKnownNegative(Stride))
      Stride = SE.getNegativeSCEV(Stride);
    Stride = SE.getNoopOrAnyExtend(Stride, WiderType);
    TripCount = SE.getNoopOrAnyExtend(TripCount, WiderType);
    const SCEV *Numerator = SE.getMulExpr(Stride, TripCount);
    RefCost = SE.getUDivExpr(Numerator, CacheLineSize);

    LLVM_DEBUG(dbgs().indent(4)
               << "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
               << *RefCost << "\n");
  } else
    LLVM_DEBUG(dbgs().indent(4)
               << "Access is not consecutive: RefCost=TripCount=" << *RefCost
               << "\n");

  // Attempt to fold RefCost into a constant.
  if (auto ConstantCost = dyn_cast<SCEVConstant>(RefCost))
    return ConstantCost->getValue()->getSExtValue();

  LLVM_DEBUG(dbgs().indent(4)
             << "RefCost is not a constant! Setting to RefCost=InvalidCost "
                "(invalid value).\n");

  return CacheCost::InvalidCost;
}
317 
// Break the reference's access function into per-dimension subscripts and
// sizes (via ScalarEvolution::delinearize), falling back to treating it as a
// one-dimensional access. Returns true iff every resulting subscript is a
// simple (affine, loop-invariant start/step) add recurrence.
bool IndexedReference::delinearize(const LoopInfo &LI) {
  assert(Subscripts.empty() && "Subscripts should be empty");
  assert(Sizes.empty() && "Sizes should be empty");
  assert(!IsValid && "Should be called once from the constructor");
  LLVM_DEBUG(dbgs() << "Delinearizing: " << StoreOrLoadInst << "\n");

  const SCEV *ElemSize = SE.getElementSize(&StoreOrLoadInst);
  const BasicBlock *BB = StoreOrLoadInst.getParent();

  if (Loop *L = LI.getLoopFor(BB)) {
    // Evaluate the pointer operand in the scope of the enclosing loop.
    const SCEV *AccessFn =
        SE.getSCEVAtScope(getPointerOperand(&StoreOrLoadInst), L);

    BasePointer = dyn_cast<SCEVUnknown>(SE.getPointerBase(AccessFn));
    if (BasePointer == nullptr) {
      LLVM_DEBUG(
          dbgs().indent(2)
          << "ERROR: failed to delinearize, can't identify base pointer\n");
      return false;
    }

    // Work with the byte offset from the base pointer from here on.
    AccessFn = SE.getMinusSCEV(AccessFn, BasePointer);

    LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName()
                                << "', AccessFn: " << *AccessFn << "\n");

    SE.delinearize(AccessFn, Subscripts, Sizes,
                   SE.getElementSize(&StoreOrLoadInst));

    if (Subscripts.empty() || Sizes.empty() ||
        Subscripts.size() != Sizes.size()) {
      // Attempt to determine whether we have a single dimensional array access.
      // before giving up.
      if (!isOneDimensionalArray(*AccessFn, *ElemSize, *L, SE)) {
        LLVM_DEBUG(dbgs().indent(2)
                   << "ERROR: failed to delinearize reference\n");
        Subscripts.clear();
        Sizes.clear();
        return false;
      }

      // The array may be accessed in reverse, for example:
      //   for (i = N; i > 0; i--)
      //     A[i] = 0;
      // In this case, reconstruct the access function using the absolute value
      // of the step recurrence.
      const SCEVAddRecExpr *AccessFnAR = dyn_cast<SCEVAddRecExpr>(AccessFn);
      // StepRec is non-null only when AccessFnAR is, so the dereference of
      // AccessFnAR below is guarded.
      const SCEV *StepRec = AccessFnAR ? AccessFnAR->getStepRecurrence(SE) : nullptr;

      if (StepRec && SE.isKnownNegative(StepRec))
        AccessFn = SE.getAddRecExpr(AccessFnAR->getStart(),
                                    SE.getNegativeSCEV(StepRec),
                                    AccessFnAR->getLoop(),
                                    AccessFnAR->getNoWrapFlags());
      // The single subscript is the byte offset divided (exactly) by the
      // element size.
      const SCEV *Div = SE.getUDivExactExpr(AccessFn, ElemSize);
      Subscripts.push_back(Div);
      Sizes.push_back(ElemSize);
    }

    // The cost model can only reason about simple add recurrences.
    return all_of(Subscripts, [&](const SCEV *Subscript) {
      return isSimpleAddRecurrence(*Subscript, *L);
    });
  }

  return false;
}
384 
385 bool IndexedReference::isLoopInvariant(const Loop &L) const {
386   Value *Addr = getPointerOperand(&StoreOrLoadInst);
387   assert(Addr != nullptr && "Expecting either a load or a store instruction");
388   assert(SE.isSCEVable(Addr->getType()) && "Addr should be SCEVable");
389 
390   if (SE.isLoopInvariant(SE.getSCEV(Addr), &L))
391     return true;
392 
393   // The indexed reference is loop invariant if none of the coefficients use
394   // the loop induction variable.
395   bool allCoeffForLoopAreZero = all_of(Subscripts, [&](const SCEV *Subscript) {
396     return isCoeffForLoopZeroOrInvariant(*Subscript, L);
397   });
398 
399   return allCoeffForLoopAreZero;
400 }
401 
402 bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const {
403   // The indexed reference is 'consecutive' if the only coefficient that uses
404   // the loop induction variable is the last one...
405   const SCEV *LastSubscript = Subscripts.back();
406   for (const SCEV *Subscript : Subscripts) {
407     if (Subscript == LastSubscript)
408       continue;
409     if (!isCoeffForLoopZeroOrInvariant(*Subscript, L))
410       return false;
411   }
412 
413   // ...and the access stride is less than the cache line size.
414   const SCEV *Coeff = getLastCoefficient();
415   const SCEV *ElemSize = Sizes.back();
416   const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
417   const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS);
418 
419   Stride = SE.isKnownNegative(Stride) ? SE.getNegativeSCEV(Stride) : Stride;
420   return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize);
421 }
422 
423 const SCEV *IndexedReference::getLastCoefficient() const {
424   const SCEV *LastSubscript = getLastSubscript();
425   assert(isa<SCEVAddRecExpr>(LastSubscript) &&
426          "Expecting a SCEV add recurrence expression");
427   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LastSubscript);
428   return AR->getStepRecurrence(SE);
429 }
430 
431 bool IndexedReference::isCoeffForLoopZeroOrInvariant(const SCEV &Subscript,
432                                                      const Loop &L) const {
433   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(&Subscript);
434   return (AR != nullptr) ? AR->getLoop() != &L
435                          : SE.isLoopInvariant(&Subscript, &L);
436 }
437 
438 bool IndexedReference::isSimpleAddRecurrence(const SCEV &Subscript,
439                                              const Loop &L) const {
440   if (!isa<SCEVAddRecExpr>(Subscript))
441     return false;
442 
443   const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(&Subscript);
444   assert(AR->getLoop() && "AR should have a loop");
445 
446   if (!AR->isAffine())
447     return false;
448 
449   const SCEV *Start = AR->getStart();
450   const SCEV *Step = AR->getStepRecurrence(SE);
451 
452   if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L))
453     return false;
454 
455   return true;
456 }
457 
458 bool IndexedReference::isAliased(const IndexedReference &Other,
459                                  AliasAnalysis &AA) const {
460   const auto &Loc1 = MemoryLocation::get(&StoreOrLoadInst);
461   const auto &Loc2 = MemoryLocation::get(&Other.StoreOrLoadInst);
462   return AA.isMustAlias(Loc1, Loc2);
463 }
464 
465 //===----------------------------------------------------------------------===//
466 // CacheCost implementation
467 //
468 raw_ostream &llvm::operator<<(raw_ostream &OS, const CacheCost &CC) {
469   for (const auto &LC : CC.LoopCosts) {
470     const Loop *L = LC.first;
471     OS << "Loop '" << L->getName() << "' has cost = " << LC.second << "\n";
472   }
473   return OS;
474 }
475 
CacheCost::CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI,
                     ScalarEvolution &SE, TargetTransformInfo &TTI,
                     AliasAnalysis &AA, DependenceInfo &DI,
                     Optional<unsigned> TRT)
    // Fall back to the command-line temporal reuse threshold when the caller
    // does not supply one.
    : Loops(Loops), TripCounts(), LoopCosts(),
      TRT((TRT == None) ? Optional<unsigned>(TemporalReuseThreshold) : TRT),
      LI(LI), SE(SE), TTI(TTI), AA(AA), DI(DI) {
  assert(!Loops.empty() && "Expecting a non-empty loop vector.");

  // Record a trip count for each loop in the nest, substituting the
  // DefaultTripCount when ScalarEvolution cannot determine a constant one
  // (getSmallConstantTripCount returns 0 in that case).
  for (const Loop *L : Loops) {
    unsigned TripCount = SE.getSmallConstantTripCount(L);
    TripCount = (TripCount == 0) ? DefaultTripCount : TripCount;
    TripCounts.push_back({L, TripCount});
  }

  calculateCacheFootprint();
}
493 
494 std::unique_ptr<CacheCost>
495 CacheCost::getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR,
496                         DependenceInfo &DI, Optional<unsigned> TRT) {
497   if (Root.getParentLoop()) {
498     LLVM_DEBUG(dbgs() << "Expecting the outermost loop in a loop nest\n");
499     return nullptr;
500   }
501 
502   LoopVectorTy Loops;
503   for (Loop *L : breadth_first(&Root))
504     Loops.push_back(L);
505 
506   if (!getInnerMostLoop(Loops)) {
507     LLVM_DEBUG(dbgs() << "Cannot compute cache cost of loop nest with more "
508                          "than one innermost loop\n");
509     return nullptr;
510   }
511 
512   return std::make_unique<CacheCost>(Loops, AR.LI, AR.SE, AR.TTI, AR.AA, DI, TRT);
513 }
514 
515 void CacheCost::calculateCacheFootprint() {
516   LLVM_DEBUG(dbgs() << "POPULATING REFERENCE GROUPS\n");
517   ReferenceGroupsTy RefGroups;
518   if (!populateReferenceGroups(RefGroups))
519     return;
520 
521   LLVM_DEBUG(dbgs() << "COMPUTING LOOP CACHE COSTS\n");
522   for (const Loop *L : Loops) {
523     assert((std::find_if(LoopCosts.begin(), LoopCosts.end(),
524                          [L](const LoopCacheCostTy &LCC) {
525                            return LCC.first == L;
526                          }) == LoopCosts.end()) &&
527            "Should not add duplicate element");
528     CacheCostTy LoopCost = computeLoopCacheCost(*L, RefGroups);
529     LoopCosts.push_back(std::make_pair(L, LoopCost));
530   }
531 
532   sortLoopCosts();
533   RefGroups.clear();
534 }
535 
// Partition every load/store of the innermost loop into reference groups:
// each instruction joins the first existing group whose representative it has
// temporal or spacial reuse with, otherwise it starts a new group. Returns
// false when no valid references were found.
bool CacheCost::populateReferenceGroups(ReferenceGroupsTy &RefGroups) const {
  assert(RefGroups.empty() && "Reference groups should be empty");

  unsigned CLS = TTI.getCacheLineSize();
  Loop *InnerMostLoop = getInnerMostLoop(Loops);
  assert(InnerMostLoop != nullptr && "Expecting a valid innermost loop");

  for (BasicBlock *BB : InnerMostLoop->getBlocks()) {
    for (Instruction &I : *BB) {
      if (!isa<StoreInst>(I) && !isa<LoadInst>(I))
        continue;

      // Skip references that could not be delinearized.
      std::unique_ptr<IndexedReference> R(new IndexedReference(I, LI, SE));
      if (!R->isValid())
        continue;

      bool Added = false;
      for (ReferenceGroupTy &RefGroup : RefGroups) {
        // Reuse is only checked against the group's first member.
        const IndexedReference &Representative = *RefGroup.front().get();
        LLVM_DEBUG({
          dbgs() << "References:\n";
          dbgs().indent(2) << *R << "\n";
          dbgs().indent(2) << Representative << "\n";
        });


       // FIXME: Both positive and negative access functions will be placed
       // into the same reference group, resulting in a bi-directional array
       // access such as:
       //   for (i = N; i > 0; i--)
       //     A[i] = A[N - i];
       // having the same cost calculation as a single dimension access pattern
       //   for (i = 0; i < N; i++)
       //     A[i] = A[i];
       // when in actuality, depending on the array size, the first example
       // should have a cost closer to 2x the second due to the two cache
       // access per iteration from opposite ends of the array
        Optional<bool> HasTemporalReuse =
            R->hasTemporalReuse(Representative, *TRT, *InnerMostLoop, DI, AA);
        Optional<bool> HasSpacialReuse =
            R->hasSpacialReuse(Representative, CLS, AA);

        // Join the group when either reuse kind is definitely present; an
        // inconclusive (None) answer does not qualify.
        if ((HasTemporalReuse.hasValue() && *HasTemporalReuse) ||
            (HasSpacialReuse.hasValue() && *HasSpacialReuse)) {
          RefGroup.push_back(std::move(R));
          Added = true;
          break;
        }
      }

      // No reuse with any existing group: start a new one.
      if (!Added) {
        ReferenceGroupTy RG;
        RG.push_back(std::move(R));
        RefGroups.push_back(std::move(RG));
      }
    }
  }

  if (RefGroups.empty())
    return false;

  LLVM_DEBUG({
    dbgs() << "\nIDENTIFIED REFERENCE GROUPS:\n";
    int n = 1;
    for (const ReferenceGroupTy &RG : RefGroups) {
      dbgs().indent(2) << "RefGroup " << n << ":\n";
      for (const auto &IR : RG)
        dbgs().indent(4) << *IR << "\n";
      n++;
    }
    dbgs() << "\n";
  });

  return true;
}
611 
612 CacheCostTy
613 CacheCost::computeLoopCacheCost(const Loop &L,
614                                 const ReferenceGroupsTy &RefGroups) const {
615   if (!L.isLoopSimplifyForm())
616     return InvalidCost;
617 
618   LLVM_DEBUG(dbgs() << "Considering loop '" << L.getName()
619                     << "' as innermost loop.\n");
620 
621   // Compute the product of the trip counts of each other loop in the nest.
622   CacheCostTy TripCountsProduct = 1;
623   for (const auto &TC : TripCounts) {
624     if (TC.first == &L)
625       continue;
626     TripCountsProduct *= TC.second;
627   }
628 
629   CacheCostTy LoopCost = 0;
630   for (const ReferenceGroupTy &RG : RefGroups) {
631     CacheCostTy RefGroupCost = computeRefGroupCacheCost(RG, L);
632     LoopCost += RefGroupCost * TripCountsProduct;
633   }
634 
635   LLVM_DEBUG(dbgs().indent(2) << "Loop '" << L.getName()
636                               << "' has cost=" << LoopCost << "\n");
637 
638   return LoopCost;
639 }
640 
641 CacheCostTy CacheCost::computeRefGroupCacheCost(const ReferenceGroupTy &RG,
642                                                 const Loop &L) const {
643   assert(!RG.empty() && "Reference group should have at least one member.");
644 
645   const IndexedReference *Representative = RG.front().get();
646   return Representative->computeRefCost(L, TTI.getCacheLineSize());
647 }
648 
649 //===----------------------------------------------------------------------===//
650 // LoopCachePrinterPass implementation
651 //
652 PreservedAnalyses LoopCachePrinterPass::run(Loop &L, LoopAnalysisManager &AM,
653                                             LoopStandardAnalysisResults &AR,
654                                             LPMUpdater &U) {
655   Function *F = L.getHeader()->getParent();
656   DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
657 
658   if (auto CC = CacheCost::getCacheCost(L, AR, DI))
659     OS << *CC;
660 
661   return PreservedAnalyses::all();
662 }
663