//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Suffix Tree class.
//
//===----------------------------------------------------------------------===//
12*bb677cacSAndrew Litteken 
13*bb677cacSAndrew Litteken #include "llvm/Support/SuffixTree.h"
14*bb677cacSAndrew Litteken #include "llvm/Support/Allocator.h"
15*bb677cacSAndrew Litteken #include <vector>
16*bb677cacSAndrew Litteken 
17*bb677cacSAndrew Litteken using namespace llvm;
18*bb677cacSAndrew Litteken 
SuffixTree(const std::vector<unsigned> & Str)19*bb677cacSAndrew Litteken SuffixTree::SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
20*bb677cacSAndrew Litteken   Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
21*bb677cacSAndrew Litteken   Active.Node = Root;
22*bb677cacSAndrew Litteken 
23*bb677cacSAndrew Litteken   // Keep track of the number of suffixes we have to add of the current
24*bb677cacSAndrew Litteken   // prefix.
25*bb677cacSAndrew Litteken   unsigned SuffixesToAdd = 0;
26*bb677cacSAndrew Litteken 
27*bb677cacSAndrew Litteken   // Construct the suffix tree iteratively on each prefix of the string.
28*bb677cacSAndrew Litteken   // PfxEndIdx is the end index of the current prefix.
29*bb677cacSAndrew Litteken   // End is one past the last element in the string.
30*bb677cacSAndrew Litteken   for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
31*bb677cacSAndrew Litteken     SuffixesToAdd++;
32*bb677cacSAndrew Litteken     LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
33*bb677cacSAndrew Litteken     SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
34*bb677cacSAndrew Litteken   }
35*bb677cacSAndrew Litteken 
36*bb677cacSAndrew Litteken   // Set the suffix indices of each leaf.
37*bb677cacSAndrew Litteken   assert(Root && "Root node can't be nullptr!");
38*bb677cacSAndrew Litteken   setSuffixIndices();
39*bb677cacSAndrew Litteken }
40*bb677cacSAndrew Litteken 
insertLeaf(SuffixTreeNode & Parent,unsigned StartIdx,unsigned Edge)41*bb677cacSAndrew Litteken SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent,
42*bb677cacSAndrew Litteken                                        unsigned StartIdx, unsigned Edge) {
43*bb677cacSAndrew Litteken 
44*bb677cacSAndrew Litteken   assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
45*bb677cacSAndrew Litteken 
46*bb677cacSAndrew Litteken   SuffixTreeNode *N = new (NodeAllocator.Allocate())
47*bb677cacSAndrew Litteken       SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
48*bb677cacSAndrew Litteken   Parent.Children[Edge] = N;
49*bb677cacSAndrew Litteken 
50*bb677cacSAndrew Litteken   return N;
51*bb677cacSAndrew Litteken }
52*bb677cacSAndrew Litteken 
insertInternalNode(SuffixTreeNode * Parent,unsigned StartIdx,unsigned EndIdx,unsigned Edge)53*bb677cacSAndrew Litteken SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent,
54*bb677cacSAndrew Litteken                                                unsigned StartIdx,
55*bb677cacSAndrew Litteken                                                unsigned EndIdx, unsigned Edge) {
56*bb677cacSAndrew Litteken 
57*bb677cacSAndrew Litteken   assert(StartIdx <= EndIdx && "String can't start after it ends!");
58*bb677cacSAndrew Litteken   assert(!(!Parent && StartIdx != EmptyIdx) &&
59*bb677cacSAndrew Litteken          "Non-root internal nodes must have parents!");
60*bb677cacSAndrew Litteken 
61*bb677cacSAndrew Litteken   unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
62*bb677cacSAndrew Litteken   SuffixTreeNode *N =
63*bb677cacSAndrew Litteken       new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
64*bb677cacSAndrew Litteken   if (Parent)
65*bb677cacSAndrew Litteken     Parent->Children[Edge] = N;
66*bb677cacSAndrew Litteken 
67*bb677cacSAndrew Litteken   return N;
68*bb677cacSAndrew Litteken }
69*bb677cacSAndrew Litteken 
setSuffixIndices()70*bb677cacSAndrew Litteken void SuffixTree::setSuffixIndices() {
71*bb677cacSAndrew Litteken   // List of nodes we need to visit along with the current length of the
72*bb677cacSAndrew Litteken   // string.
73*bb677cacSAndrew Litteken   std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
74*bb677cacSAndrew Litteken 
75*bb677cacSAndrew Litteken   // Current node being visited.
76*bb677cacSAndrew Litteken   SuffixTreeNode *CurrNode = Root;
77*bb677cacSAndrew Litteken 
78*bb677cacSAndrew Litteken   // Sum of the lengths of the nodes down the path to the current one.
79*bb677cacSAndrew Litteken   unsigned CurrNodeLen = 0;
80*bb677cacSAndrew Litteken   ToVisit.push_back({CurrNode, CurrNodeLen});
81*bb677cacSAndrew Litteken   while (!ToVisit.empty()) {
82*bb677cacSAndrew Litteken     std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
83*bb677cacSAndrew Litteken     ToVisit.pop_back();
84*bb677cacSAndrew Litteken     CurrNode->ConcatLen = CurrNodeLen;
85*bb677cacSAndrew Litteken     for (auto &ChildPair : CurrNode->Children) {
86*bb677cacSAndrew Litteken       assert(ChildPair.second && "Node had a null child!");
87*bb677cacSAndrew Litteken       ToVisit.push_back(
88*bb677cacSAndrew Litteken           {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
89*bb677cacSAndrew Litteken     }
90*bb677cacSAndrew Litteken 
91*bb677cacSAndrew Litteken     // No children, so we are at the end of the string.
92*bb677cacSAndrew Litteken     if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
93*bb677cacSAndrew Litteken       CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
94*bb677cacSAndrew Litteken   }
95*bb677cacSAndrew Litteken }
96*bb677cacSAndrew Litteken 
extend(unsigned EndIdx,unsigned SuffixesToAdd)97*bb677cacSAndrew Litteken unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
98*bb677cacSAndrew Litteken   SuffixTreeNode *NeedsLink = nullptr;
99*bb677cacSAndrew Litteken 
100*bb677cacSAndrew Litteken   while (SuffixesToAdd > 0) {
101*bb677cacSAndrew Litteken 
102*bb677cacSAndrew Litteken     // Are we waiting to add anything other than just the last character?
103*bb677cacSAndrew Litteken     if (Active.Len == 0) {
104*bb677cacSAndrew Litteken       // If not, then say the active index is the end index.
105*bb677cacSAndrew Litteken       Active.Idx = EndIdx;
106*bb677cacSAndrew Litteken     }
107*bb677cacSAndrew Litteken 
108*bb677cacSAndrew Litteken     assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
109*bb677cacSAndrew Litteken 
110*bb677cacSAndrew Litteken     // The first character in the current substring we're looking at.
111*bb677cacSAndrew Litteken     unsigned FirstChar = Str[Active.Idx];
112*bb677cacSAndrew Litteken 
113*bb677cacSAndrew Litteken     // Have we inserted anything starting with FirstChar at the current node?
114*bb677cacSAndrew Litteken     if (Active.Node->Children.count(FirstChar) == 0) {
115*bb677cacSAndrew Litteken       // If not, then we can just insert a leaf and move to the next step.
116*bb677cacSAndrew Litteken       insertLeaf(*Active.Node, EndIdx, FirstChar);
117*bb677cacSAndrew Litteken 
118*bb677cacSAndrew Litteken       // The active node is an internal node, and we visited it, so it must
119*bb677cacSAndrew Litteken       // need a link if it doesn't have one.
120*bb677cacSAndrew Litteken       if (NeedsLink) {
121*bb677cacSAndrew Litteken         NeedsLink->Link = Active.Node;
122*bb677cacSAndrew Litteken         NeedsLink = nullptr;
123*bb677cacSAndrew Litteken       }
124*bb677cacSAndrew Litteken     } else {
125*bb677cacSAndrew Litteken       // There's a match with FirstChar, so look for the point in the tree to
126*bb677cacSAndrew Litteken       // insert a new node.
127*bb677cacSAndrew Litteken       SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
128*bb677cacSAndrew Litteken 
129*bb677cacSAndrew Litteken       unsigned SubstringLen = NextNode->size();
130*bb677cacSAndrew Litteken 
131*bb677cacSAndrew Litteken       // Is the current suffix we're trying to insert longer than the size of
132*bb677cacSAndrew Litteken       // the child we want to move to?
133*bb677cacSAndrew Litteken       if (Active.Len >= SubstringLen) {
134*bb677cacSAndrew Litteken         // If yes, then consume the characters we've seen and move to the next
135*bb677cacSAndrew Litteken         // node.
136*bb677cacSAndrew Litteken         Active.Idx += SubstringLen;
137*bb677cacSAndrew Litteken         Active.Len -= SubstringLen;
138*bb677cacSAndrew Litteken         Active.Node = NextNode;
139*bb677cacSAndrew Litteken         continue;
140*bb677cacSAndrew Litteken       }
141*bb677cacSAndrew Litteken 
142*bb677cacSAndrew Litteken       // Otherwise, the suffix we're trying to insert must be contained in the
143*bb677cacSAndrew Litteken       // next node we want to move to.
144*bb677cacSAndrew Litteken       unsigned LastChar = Str[EndIdx];
145*bb677cacSAndrew Litteken 
146*bb677cacSAndrew Litteken       // Is the string we're trying to insert a substring of the next node?
147*bb677cacSAndrew Litteken       if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
148*bb677cacSAndrew Litteken         // If yes, then we're done for this step. Remember our insertion point
149*bb677cacSAndrew Litteken         // and move to the next end index. At this point, we have an implicit
150*bb677cacSAndrew Litteken         // suffix tree.
151*bb677cacSAndrew Litteken         if (NeedsLink && !Active.Node->isRoot()) {
152*bb677cacSAndrew Litteken           NeedsLink->Link = Active.Node;
153*bb677cacSAndrew Litteken           NeedsLink = nullptr;
154*bb677cacSAndrew Litteken         }
155*bb677cacSAndrew Litteken 
156*bb677cacSAndrew Litteken         Active.Len++;
157*bb677cacSAndrew Litteken         break;
158*bb677cacSAndrew Litteken       }
159*bb677cacSAndrew Litteken 
160*bb677cacSAndrew Litteken       // The string we're trying to insert isn't a substring of the next node,
161*bb677cacSAndrew Litteken       // but matches up to a point. Split the node.
162*bb677cacSAndrew Litteken       //
163*bb677cacSAndrew Litteken       // For example, say we ended our search at a node n and we're trying to
164*bb677cacSAndrew Litteken       // insert ABD. Then we'll create a new node s for AB, reduce n to just
165*bb677cacSAndrew Litteken       // representing C, and insert a new leaf node l to represent d. This
166*bb677cacSAndrew Litteken       // allows us to ensure that if n was a leaf, it remains a leaf.
167*bb677cacSAndrew Litteken       //
168*bb677cacSAndrew Litteken       //   | ABC  ---split--->  | AB
169*bb677cacSAndrew Litteken       //   n                    s
170*bb677cacSAndrew Litteken       //                     C / \ D
171*bb677cacSAndrew Litteken       //                      n   l
172*bb677cacSAndrew Litteken 
173*bb677cacSAndrew Litteken       // The node s from the diagram
174*bb677cacSAndrew Litteken       SuffixTreeNode *SplitNode =
175*bb677cacSAndrew Litteken           insertInternalNode(Active.Node, NextNode->StartIdx,
176*bb677cacSAndrew Litteken                              NextNode->StartIdx + Active.Len - 1, FirstChar);
177*bb677cacSAndrew Litteken 
178*bb677cacSAndrew Litteken       // Insert the new node representing the new substring into the tree as
179*bb677cacSAndrew Litteken       // a child of the split node. This is the node l from the diagram.
180*bb677cacSAndrew Litteken       insertLeaf(*SplitNode, EndIdx, LastChar);
181*bb677cacSAndrew Litteken 
182*bb677cacSAndrew Litteken       // Make the old node a child of the split node and update its start
183*bb677cacSAndrew Litteken       // index. This is the node n from the diagram.
184*bb677cacSAndrew Litteken       NextNode->StartIdx += Active.Len;
185*bb677cacSAndrew Litteken       SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
186*bb677cacSAndrew Litteken 
187*bb677cacSAndrew Litteken       // SplitNode is an internal node, update the suffix link.
188*bb677cacSAndrew Litteken       if (NeedsLink)
189*bb677cacSAndrew Litteken         NeedsLink->Link = SplitNode;
190*bb677cacSAndrew Litteken 
191*bb677cacSAndrew Litteken       NeedsLink = SplitNode;
192*bb677cacSAndrew Litteken     }
193*bb677cacSAndrew Litteken 
194*bb677cacSAndrew Litteken     // We've added something new to the tree, so there's one less suffix to
195*bb677cacSAndrew Litteken     // add.
196*bb677cacSAndrew Litteken     SuffixesToAdd--;
197*bb677cacSAndrew Litteken 
198*bb677cacSAndrew Litteken     if (Active.Node->isRoot()) {
199*bb677cacSAndrew Litteken       if (Active.Len > 0) {
200*bb677cacSAndrew Litteken         Active.Len--;
201*bb677cacSAndrew Litteken         Active.Idx = EndIdx - SuffixesToAdd + 1;
202*bb677cacSAndrew Litteken       }
203*bb677cacSAndrew Litteken     } else {
204*bb677cacSAndrew Litteken       // Start the next phase at the next smallest suffix.
205*bb677cacSAndrew Litteken       Active.Node = Active.Node->Link;
206*bb677cacSAndrew Litteken     }
207*bb677cacSAndrew Litteken   }
208*bb677cacSAndrew Litteken 
209*bb677cacSAndrew Litteken   return SuffixesToAdd;
210*bb677cacSAndrew Litteken }
211