1 //===- unittests/Support/SuffixTreeTest.cpp - suffix tree tests -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "llvm/Support/SuffixTree.h"
10 #include "gtest/gtest.h"
11 #include <vector>
12 
13 using namespace llvm;
14 
15 namespace {
16 
17 // Each example vector has a unique element at the end to represent the end of
18 // the string
19 
20 // Tests that The SuffixTree finds a simple repetition of the substring {1, 2}
21 // {1, 2} twice in the provided string.
22 TEST(SuffixTreeTest, TestSingleRepetition) {
23   std::vector<unsigned> SimpleRepetitionData = {1, 2, 1, 2, 3};
24   SuffixTree ST(SimpleRepetitionData);
25   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
26   for (auto It = ST.begin(); It != ST.end(); It++)
27     SubStrings.push_back(*It);
28   ASSERT_EQ(SubStrings.size(), 1u);
29   EXPECT_EQ(SubStrings[0].Length, 2u);
30   EXPECT_EQ(SubStrings[0].StartIndices.size(), 2u);
31   for (unsigned StartIdx : SubStrings[0].StartIndices) {
32     EXPECT_EQ(SimpleRepetitionData[StartIdx], 1u);
33     EXPECT_EQ(SimpleRepetitionData[StartIdx + 1], 2u);
34   }
35 }
36 
37 // Tests that the SuffixTree is able to find the substrings {1, 2, 3} at
38 // at indices 0 and 3 as well as the substrings {2, 3} at indices 1 and 4.
39 // This test also serves as a flag for improvements to the suffix tree.
40 //
41 // FIXME: Right now, the longest repeated substring from a specific index is
42 // returned, this could be improved to return the longest repeated substring, as
43 // well as those that are smaller.
44 TEST(SuffixTreeTest, TestLongerRepetition) {
45   std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3, 4};
46   SuffixTree ST(RepeatedRepetitionData);
47   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
48   for (auto It = ST.begin(); It != ST.end(); It++)
49     SubStrings.push_back(*It);
50   EXPECT_EQ(SubStrings.size(), 2u);
51   unsigned Len;
52   for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
53     Len = RS.Length;
54     bool IsExpectedLen = (Len == 3u || Len == 2u);
55     bool IsExpectedIndex;
56     ASSERT_TRUE(IsExpectedLen);
57 
58     if (Len == 3u) {
59       for (unsigned StartIdx : RS.StartIndices) {
60         IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
61         EXPECT_TRUE(IsExpectedIndex);
62         EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
63         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
64         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
65       }
66     } else {
67       for (unsigned StartIdx : RS.StartIndices) {
68         IsExpectedIndex = (StartIdx == 1u || StartIdx == 4u);
69         EXPECT_TRUE(IsExpectedIndex);
70         EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
71         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
72       }
73     }
74   }
75 }
76 
77 // Tests that the SuffixTree is able to find substring {1, 1, 1, 1, 1} at
78 // indices 0 and 1.
79 //
80 // FIXME: Add support for detecting {1, 1} and {1, 1, 1}
81 TEST(SuffixTreeTest, TestSingleCharacterRepeat) {
82   std::vector<unsigned> RepeatedRepetitionData = {1, 1, 1, 1, 1, 1, 2};
83   std::vector<unsigned>::iterator RRDIt, RRDIt2;
84   SuffixTree ST(RepeatedRepetitionData);
85   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
86   for (auto It = ST.begin(); It != ST.end(); It++)
87     SubStrings.push_back(*It);
88   EXPECT_EQ(SubStrings.size(), 1u);
89   for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
90     EXPECT_EQ(RS.StartIndices.size(),
91               RepeatedRepetitionData.size() - RS.Length);
92     for (unsigned StartIdx : SubStrings[0].StartIndices) {
93       RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
94       std::advance(RRDIt, StartIdx);
95       std::advance(RRDIt2, StartIdx + SubStrings[0].Length);
96       ASSERT_TRUE(
97           all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
98                  [](unsigned Elt) { return Elt == 1; }));
99     }
100   }
101 }
102 
103 // The suffix tree cannot currently find repeated substrings in strings of the
104 // form {1, 2, 3, 1, 2, 3}, because the two {1, 2, 3}s are adjacent ("tandem
105 // repeats")
106 //
107 // FIXME: Teach the SuffixTree to recognize these cases.
108 TEST(SuffixTreeTest, TestTandemRepeat) {
109   std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3};
110   SuffixTree ST(RepeatedRepetitionData);
111   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
112   for (auto It = ST.begin(); It != ST.end(); It++)
113     SubStrings.push_back(*It);
114   EXPECT_EQ(SubStrings.size(), 0u);
115 }
116 
117 // Tests that the SuffixTree does not erroneously include values that are not
118 // in repeated substrings.  That is, only finds {1, 1} at indices 0 and 3 and
119 // does not include 2 and 3.
120 TEST(SuffixTreeTest, TestExclusion) {
121   std::vector<unsigned> RepeatedRepetitionData = {1, 1, 2, 1, 1, 3};
122   std::vector<unsigned>::iterator RRDIt, RRDIt2;
123   SuffixTree ST(RepeatedRepetitionData);
124   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
125   for (auto It = ST.begin(); It != ST.end(); It++)
126     SubStrings.push_back(*It);
127   EXPECT_EQ(SubStrings.size(), 1u);
128   bool IsExpectedIndex;
129   for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
130     for (unsigned StartIdx : RS.StartIndices) {
131       IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
132       EXPECT_TRUE(IsExpectedIndex);
133       RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
134       std::advance(RRDIt, StartIdx);
135       std::advance(RRDIt2, StartIdx + RS.Length);
136       EXPECT_TRUE(
137           all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
138                  [](unsigned Elt) { return Elt == 1; }));
139     }
140   }
141 }
142 
143 } // namespace
144