//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Suffix Tree class.
//
//===----------------------------------------------------------------------===//
12*5ffd83dbSDimitry Andric 
13*5ffd83dbSDimitry Andric #include "llvm/Support/SuffixTree.h"
14*5ffd83dbSDimitry Andric #include "llvm/Support/Allocator.h"
15*5ffd83dbSDimitry Andric #include <vector>
16*5ffd83dbSDimitry Andric 
17*5ffd83dbSDimitry Andric using namespace llvm;
18*5ffd83dbSDimitry Andric 
SuffixTree(const std::vector<unsigned> & Str)19*5ffd83dbSDimitry Andric SuffixTree::SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
20*5ffd83dbSDimitry Andric   Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
21*5ffd83dbSDimitry Andric   Active.Node = Root;
22*5ffd83dbSDimitry Andric 
23*5ffd83dbSDimitry Andric   // Keep track of the number of suffixes we have to add of the current
24*5ffd83dbSDimitry Andric   // prefix.
25*5ffd83dbSDimitry Andric   unsigned SuffixesToAdd = 0;
26*5ffd83dbSDimitry Andric 
27*5ffd83dbSDimitry Andric   // Construct the suffix tree iteratively on each prefix of the string.
28*5ffd83dbSDimitry Andric   // PfxEndIdx is the end index of the current prefix.
29*5ffd83dbSDimitry Andric   // End is one past the last element in the string.
30*5ffd83dbSDimitry Andric   for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
31*5ffd83dbSDimitry Andric     SuffixesToAdd++;
32*5ffd83dbSDimitry Andric     LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
33*5ffd83dbSDimitry Andric     SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
34*5ffd83dbSDimitry Andric   }
35*5ffd83dbSDimitry Andric 
36*5ffd83dbSDimitry Andric   // Set the suffix indices of each leaf.
37*5ffd83dbSDimitry Andric   assert(Root && "Root node can't be nullptr!");
38*5ffd83dbSDimitry Andric   setSuffixIndices();
39*5ffd83dbSDimitry Andric }
40*5ffd83dbSDimitry Andric 
insertLeaf(SuffixTreeNode & Parent,unsigned StartIdx,unsigned Edge)41*5ffd83dbSDimitry Andric SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent,
42*5ffd83dbSDimitry Andric                                        unsigned StartIdx, unsigned Edge) {
43*5ffd83dbSDimitry Andric 
44*5ffd83dbSDimitry Andric   assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
45*5ffd83dbSDimitry Andric 
46*5ffd83dbSDimitry Andric   SuffixTreeNode *N = new (NodeAllocator.Allocate())
47*5ffd83dbSDimitry Andric       SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
48*5ffd83dbSDimitry Andric   Parent.Children[Edge] = N;
49*5ffd83dbSDimitry Andric 
50*5ffd83dbSDimitry Andric   return N;
51*5ffd83dbSDimitry Andric }
52*5ffd83dbSDimitry Andric 
insertInternalNode(SuffixTreeNode * Parent,unsigned StartIdx,unsigned EndIdx,unsigned Edge)53*5ffd83dbSDimitry Andric SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent,
54*5ffd83dbSDimitry Andric                                                unsigned StartIdx,
55*5ffd83dbSDimitry Andric                                                unsigned EndIdx, unsigned Edge) {
56*5ffd83dbSDimitry Andric 
57*5ffd83dbSDimitry Andric   assert(StartIdx <= EndIdx && "String can't start after it ends!");
58*5ffd83dbSDimitry Andric   assert(!(!Parent && StartIdx != EmptyIdx) &&
59*5ffd83dbSDimitry Andric          "Non-root internal nodes must have parents!");
60*5ffd83dbSDimitry Andric 
61*5ffd83dbSDimitry Andric   unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
62*5ffd83dbSDimitry Andric   SuffixTreeNode *N =
63*5ffd83dbSDimitry Andric       new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
64*5ffd83dbSDimitry Andric   if (Parent)
65*5ffd83dbSDimitry Andric     Parent->Children[Edge] = N;
66*5ffd83dbSDimitry Andric 
67*5ffd83dbSDimitry Andric   return N;
68*5ffd83dbSDimitry Andric }
69*5ffd83dbSDimitry Andric 
setSuffixIndices()70*5ffd83dbSDimitry Andric void SuffixTree::setSuffixIndices() {
71*5ffd83dbSDimitry Andric   // List of nodes we need to visit along with the current length of the
72*5ffd83dbSDimitry Andric   // string.
73*5ffd83dbSDimitry Andric   std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
74*5ffd83dbSDimitry Andric 
75*5ffd83dbSDimitry Andric   // Current node being visited.
76*5ffd83dbSDimitry Andric   SuffixTreeNode *CurrNode = Root;
77*5ffd83dbSDimitry Andric 
78*5ffd83dbSDimitry Andric   // Sum of the lengths of the nodes down the path to the current one.
79*5ffd83dbSDimitry Andric   unsigned CurrNodeLen = 0;
80*5ffd83dbSDimitry Andric   ToVisit.push_back({CurrNode, CurrNodeLen});
81*5ffd83dbSDimitry Andric   while (!ToVisit.empty()) {
82*5ffd83dbSDimitry Andric     std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
83*5ffd83dbSDimitry Andric     ToVisit.pop_back();
84*5ffd83dbSDimitry Andric     CurrNode->ConcatLen = CurrNodeLen;
85*5ffd83dbSDimitry Andric     for (auto &ChildPair : CurrNode->Children) {
86*5ffd83dbSDimitry Andric       assert(ChildPair.second && "Node had a null child!");
87*5ffd83dbSDimitry Andric       ToVisit.push_back(
88*5ffd83dbSDimitry Andric           {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
89*5ffd83dbSDimitry Andric     }
90*5ffd83dbSDimitry Andric 
91*5ffd83dbSDimitry Andric     // No children, so we are at the end of the string.
92*5ffd83dbSDimitry Andric     if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
93*5ffd83dbSDimitry Andric       CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
94*5ffd83dbSDimitry Andric   }
95*5ffd83dbSDimitry Andric }
96*5ffd83dbSDimitry Andric 
extend(unsigned EndIdx,unsigned SuffixesToAdd)97*5ffd83dbSDimitry Andric unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
98*5ffd83dbSDimitry Andric   SuffixTreeNode *NeedsLink = nullptr;
99*5ffd83dbSDimitry Andric 
100*5ffd83dbSDimitry Andric   while (SuffixesToAdd > 0) {
101*5ffd83dbSDimitry Andric 
102*5ffd83dbSDimitry Andric     // Are we waiting to add anything other than just the last character?
103*5ffd83dbSDimitry Andric     if (Active.Len == 0) {
104*5ffd83dbSDimitry Andric       // If not, then say the active index is the end index.
105*5ffd83dbSDimitry Andric       Active.Idx = EndIdx;
106*5ffd83dbSDimitry Andric     }
107*5ffd83dbSDimitry Andric 
108*5ffd83dbSDimitry Andric     assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
109*5ffd83dbSDimitry Andric 
110*5ffd83dbSDimitry Andric     // The first character in the current substring we're looking at.
111*5ffd83dbSDimitry Andric     unsigned FirstChar = Str[Active.Idx];
112*5ffd83dbSDimitry Andric 
113*5ffd83dbSDimitry Andric     // Have we inserted anything starting with FirstChar at the current node?
114*5ffd83dbSDimitry Andric     if (Active.Node->Children.count(FirstChar) == 0) {
115*5ffd83dbSDimitry Andric       // If not, then we can just insert a leaf and move to the next step.
116*5ffd83dbSDimitry Andric       insertLeaf(*Active.Node, EndIdx, FirstChar);
117*5ffd83dbSDimitry Andric 
118*5ffd83dbSDimitry Andric       // The active node is an internal node, and we visited it, so it must
119*5ffd83dbSDimitry Andric       // need a link if it doesn't have one.
120*5ffd83dbSDimitry Andric       if (NeedsLink) {
121*5ffd83dbSDimitry Andric         NeedsLink->Link = Active.Node;
122*5ffd83dbSDimitry Andric         NeedsLink = nullptr;
123*5ffd83dbSDimitry Andric       }
124*5ffd83dbSDimitry Andric     } else {
125*5ffd83dbSDimitry Andric       // There's a match with FirstChar, so look for the point in the tree to
126*5ffd83dbSDimitry Andric       // insert a new node.
127*5ffd83dbSDimitry Andric       SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
128*5ffd83dbSDimitry Andric 
129*5ffd83dbSDimitry Andric       unsigned SubstringLen = NextNode->size();
130*5ffd83dbSDimitry Andric 
131*5ffd83dbSDimitry Andric       // Is the current suffix we're trying to insert longer than the size of
132*5ffd83dbSDimitry Andric       // the child we want to move to?
133*5ffd83dbSDimitry Andric       if (Active.Len >= SubstringLen) {
134*5ffd83dbSDimitry Andric         // If yes, then consume the characters we've seen and move to the next
135*5ffd83dbSDimitry Andric         // node.
136*5ffd83dbSDimitry Andric         Active.Idx += SubstringLen;
137*5ffd83dbSDimitry Andric         Active.Len -= SubstringLen;
138*5ffd83dbSDimitry Andric         Active.Node = NextNode;
139*5ffd83dbSDimitry Andric         continue;
140*5ffd83dbSDimitry Andric       }
141*5ffd83dbSDimitry Andric 
142*5ffd83dbSDimitry Andric       // Otherwise, the suffix we're trying to insert must be contained in the
143*5ffd83dbSDimitry Andric       // next node we want to move to.
144*5ffd83dbSDimitry Andric       unsigned LastChar = Str[EndIdx];
145*5ffd83dbSDimitry Andric 
146*5ffd83dbSDimitry Andric       // Is the string we're trying to insert a substring of the next node?
147*5ffd83dbSDimitry Andric       if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
148*5ffd83dbSDimitry Andric         // If yes, then we're done for this step. Remember our insertion point
149*5ffd83dbSDimitry Andric         // and move to the next end index. At this point, we have an implicit
150*5ffd83dbSDimitry Andric         // suffix tree.
151*5ffd83dbSDimitry Andric         if (NeedsLink && !Active.Node->isRoot()) {
152*5ffd83dbSDimitry Andric           NeedsLink->Link = Active.Node;
153*5ffd83dbSDimitry Andric           NeedsLink = nullptr;
154*5ffd83dbSDimitry Andric         }
155*5ffd83dbSDimitry Andric 
156*5ffd83dbSDimitry Andric         Active.Len++;
157*5ffd83dbSDimitry Andric         break;
158*5ffd83dbSDimitry Andric       }
159*5ffd83dbSDimitry Andric 
160*5ffd83dbSDimitry Andric       // The string we're trying to insert isn't a substring of the next node,
161*5ffd83dbSDimitry Andric       // but matches up to a point. Split the node.
162*5ffd83dbSDimitry Andric       //
163*5ffd83dbSDimitry Andric       // For example, say we ended our search at a node n and we're trying to
164*5ffd83dbSDimitry Andric       // insert ABD. Then we'll create a new node s for AB, reduce n to just
165*5ffd83dbSDimitry Andric       // representing C, and insert a new leaf node l to represent d. This
166*5ffd83dbSDimitry Andric       // allows us to ensure that if n was a leaf, it remains a leaf.
167*5ffd83dbSDimitry Andric       //
168*5ffd83dbSDimitry Andric       //   | ABC  ---split--->  | AB
169*5ffd83dbSDimitry Andric       //   n                    s
170*5ffd83dbSDimitry Andric       //                     C / \ D
171*5ffd83dbSDimitry Andric       //                      n   l
172*5ffd83dbSDimitry Andric 
173*5ffd83dbSDimitry Andric       // The node s from the diagram
174*5ffd83dbSDimitry Andric       SuffixTreeNode *SplitNode =
175*5ffd83dbSDimitry Andric           insertInternalNode(Active.Node, NextNode->StartIdx,
176*5ffd83dbSDimitry Andric                              NextNode->StartIdx + Active.Len - 1, FirstChar);
177*5ffd83dbSDimitry Andric 
178*5ffd83dbSDimitry Andric       // Insert the new node representing the new substring into the tree as
179*5ffd83dbSDimitry Andric       // a child of the split node. This is the node l from the diagram.
180*5ffd83dbSDimitry Andric       insertLeaf(*SplitNode, EndIdx, LastChar);
181*5ffd83dbSDimitry Andric 
182*5ffd83dbSDimitry Andric       // Make the old node a child of the split node and update its start
183*5ffd83dbSDimitry Andric       // index. This is the node n from the diagram.
184*5ffd83dbSDimitry Andric       NextNode->StartIdx += Active.Len;
185*5ffd83dbSDimitry Andric       SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
186*5ffd83dbSDimitry Andric 
187*5ffd83dbSDimitry Andric       // SplitNode is an internal node, update the suffix link.
188*5ffd83dbSDimitry Andric       if (NeedsLink)
189*5ffd83dbSDimitry Andric         NeedsLink->Link = SplitNode;
190*5ffd83dbSDimitry Andric 
191*5ffd83dbSDimitry Andric       NeedsLink = SplitNode;
192*5ffd83dbSDimitry Andric     }
193*5ffd83dbSDimitry Andric 
194*5ffd83dbSDimitry Andric     // We've added something new to the tree, so there's one less suffix to
195*5ffd83dbSDimitry Andric     // add.
196*5ffd83dbSDimitry Andric     SuffixesToAdd--;
197*5ffd83dbSDimitry Andric 
198*5ffd83dbSDimitry Andric     if (Active.Node->isRoot()) {
199*5ffd83dbSDimitry Andric       if (Active.Len > 0) {
200*5ffd83dbSDimitry Andric         Active.Len--;
201*5ffd83dbSDimitry Andric         Active.Idx = EndIdx - SuffixesToAdd + 1;
202*5ffd83dbSDimitry Andric       }
203*5ffd83dbSDimitry Andric     } else {
204*5ffd83dbSDimitry Andric       // Start the next phase at the next smallest suffix.
205*5ffd83dbSDimitry Andric       Active.Node = Active.Node->Link;
206*5ffd83dbSDimitry Andric     }
207*5ffd83dbSDimitry Andric   }
208*5ffd83dbSDimitry Andric 
209*5ffd83dbSDimitry Andric   return SuffixesToAdd;
210*5ffd83dbSDimitry Andric }
211