1*bb677cacSAndrew Litteken //===- unittests/Support/SuffixTreeTest.cpp - suffix tree tests -----------===//
2*bb677cacSAndrew Litteken //
3*bb677cacSAndrew Litteken // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*bb677cacSAndrew Litteken // See https://llvm.org/LICENSE.txt for license information.
5*bb677cacSAndrew Litteken // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*bb677cacSAndrew Litteken //
7*bb677cacSAndrew Litteken //===----------------------------------------------------------------------===//
8*bb677cacSAndrew Litteken 
9*bb677cacSAndrew Litteken #include "llvm/Support/SuffixTree.h"
10*bb677cacSAndrew Litteken #include "gtest/gtest.h"
11*bb677cacSAndrew Litteken #include <vector>
12*bb677cacSAndrew Litteken 
13*bb677cacSAndrew Litteken using namespace llvm;
14*bb677cacSAndrew Litteken 
15*bb677cacSAndrew Litteken namespace {
16*bb677cacSAndrew Litteken 
17*bb677cacSAndrew Litteken // Each example vector has a unique element at the end to represent the end of
18*bb677cacSAndrew Litteken // the string
19*bb677cacSAndrew Litteken 
20*bb677cacSAndrew Litteken // Tests that The SuffixTree finds a simple repetition of the substring {1, 2}
21*bb677cacSAndrew Litteken // {1, 2} twice in the provided string.
TEST(SuffixTreeTest,TestSingleRepetition)22*bb677cacSAndrew Litteken TEST(SuffixTreeTest, TestSingleRepetition) {
23*bb677cacSAndrew Litteken   std::vector<unsigned> SimpleRepetitionData = {1, 2, 1, 2, 3};
24*bb677cacSAndrew Litteken   SuffixTree ST(SimpleRepetitionData);
25*bb677cacSAndrew Litteken   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
26*bb677cacSAndrew Litteken   for (auto It = ST.begin(); It != ST.end(); It++)
27*bb677cacSAndrew Litteken     SubStrings.push_back(*It);
28*bb677cacSAndrew Litteken   ASSERT_EQ(SubStrings.size(), 1u);
29*bb677cacSAndrew Litteken   EXPECT_EQ(SubStrings[0].Length, 2u);
30*bb677cacSAndrew Litteken   EXPECT_EQ(SubStrings[0].StartIndices.size(), 2u);
31*bb677cacSAndrew Litteken   for (unsigned StartIdx : SubStrings[0].StartIndices) {
32*bb677cacSAndrew Litteken     EXPECT_EQ(SimpleRepetitionData[StartIdx], 1u);
33*bb677cacSAndrew Litteken     EXPECT_EQ(SimpleRepetitionData[StartIdx + 1], 2u);
34*bb677cacSAndrew Litteken   }
35*bb677cacSAndrew Litteken }
36*bb677cacSAndrew Litteken 
37*bb677cacSAndrew Litteken // Tests that the SuffixTree is able to find the substrings {1, 2, 3} at
38*bb677cacSAndrew Litteken // at indices 0 and 3 as well as the substrings {2, 3} at indices 1 and 4.
39*bb677cacSAndrew Litteken // This test also serves as a flag for improvements to the suffix tree.
40*bb677cacSAndrew Litteken //
41*bb677cacSAndrew Litteken // FIXME: Right now, the longest repeated substring from a specific index is
42*bb677cacSAndrew Litteken // returned, this could be improved to return the longest repeated substring, as
43*bb677cacSAndrew Litteken // well as those that are smaller.
TEST(SuffixTreeTest,TestLongerRepetition)44*bb677cacSAndrew Litteken TEST(SuffixTreeTest, TestLongerRepetition) {
45*bb677cacSAndrew Litteken   std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3, 4};
46*bb677cacSAndrew Litteken   SuffixTree ST(RepeatedRepetitionData);
47*bb677cacSAndrew Litteken   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
48*bb677cacSAndrew Litteken   for (auto It = ST.begin(); It != ST.end(); It++)
49*bb677cacSAndrew Litteken     SubStrings.push_back(*It);
50*bb677cacSAndrew Litteken   EXPECT_EQ(SubStrings.size(), 2u);
51*bb677cacSAndrew Litteken   unsigned Len;
52*bb677cacSAndrew Litteken   for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
53*bb677cacSAndrew Litteken     Len = RS.Length;
54*bb677cacSAndrew Litteken     bool IsExpectedLen = (Len == 3u || Len == 2u);
55*bb677cacSAndrew Litteken     bool IsExpectedIndex;
56*bb677cacSAndrew Litteken     ASSERT_TRUE(IsExpectedLen);
57*bb677cacSAndrew Litteken 
58*bb677cacSAndrew Litteken     if (Len == 3u) {
59*bb677cacSAndrew Litteken       for (unsigned StartIdx : RS.StartIndices) {
60*bb677cacSAndrew Litteken         IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
61*bb677cacSAndrew Litteken         EXPECT_TRUE(IsExpectedIndex);
62*bb677cacSAndrew Litteken         EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
63*bb677cacSAndrew Litteken         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
64*bb677cacSAndrew Litteken         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
65*bb677cacSAndrew Litteken       }
66*bb677cacSAndrew Litteken     } else {
67*bb677cacSAndrew Litteken       for (unsigned StartIdx : RS.StartIndices) {
68*bb677cacSAndrew Litteken         IsExpectedIndex = (StartIdx == 1u || StartIdx == 4u);
69*bb677cacSAndrew Litteken         EXPECT_TRUE(IsExpectedIndex);
70*bb677cacSAndrew Litteken         EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
71*bb677cacSAndrew Litteken         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
72*bb677cacSAndrew Litteken       }
73*bb677cacSAndrew Litteken     }
74*bb677cacSAndrew Litteken   }
75*bb677cacSAndrew Litteken }
76*bb677cacSAndrew Litteken 
77*bb677cacSAndrew Litteken // Tests that the SuffixTree is able to find substring {1, 1, 1, 1, 1} at
78*bb677cacSAndrew Litteken // indices 0 and 1.
79*bb677cacSAndrew Litteken //
80*bb677cacSAndrew Litteken // FIXME: Add support for detecting {1, 1} and {1, 1, 1}
TEST(SuffixTreeTest,TestSingleCharacterRepeat)81*bb677cacSAndrew Litteken TEST(SuffixTreeTest, TestSingleCharacterRepeat) {
82*bb677cacSAndrew Litteken   std::vector<unsigned> RepeatedRepetitionData = {1, 1, 1, 1, 1, 1, 2};
83*bb677cacSAndrew Litteken   std::vector<unsigned>::iterator RRDIt, RRDIt2;
84*bb677cacSAndrew Litteken   SuffixTree ST(RepeatedRepetitionData);
85*bb677cacSAndrew Litteken   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
86*bb677cacSAndrew Litteken   for (auto It = ST.begin(); It != ST.end(); It++)
87*bb677cacSAndrew Litteken     SubStrings.push_back(*It);
88*bb677cacSAndrew Litteken   EXPECT_EQ(SubStrings.size(), 1u);
89*bb677cacSAndrew Litteken   for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
90*bb677cacSAndrew Litteken     EXPECT_EQ(RS.StartIndices.size(),
91*bb677cacSAndrew Litteken               RepeatedRepetitionData.size() - RS.Length);
92*bb677cacSAndrew Litteken     for (unsigned StartIdx : SubStrings[0].StartIndices) {
93*bb677cacSAndrew Litteken       RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
94*bb677cacSAndrew Litteken       std::advance(RRDIt, StartIdx);
95*bb677cacSAndrew Litteken       std::advance(RRDIt2, StartIdx + SubStrings[0].Length);
96*bb677cacSAndrew Litteken       ASSERT_TRUE(
97*bb677cacSAndrew Litteken           all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
98*bb677cacSAndrew Litteken                  [](unsigned Elt) { return Elt == 1; }));
99*bb677cacSAndrew Litteken     }
100*bb677cacSAndrew Litteken   }
101*bb677cacSAndrew Litteken }
102*bb677cacSAndrew Litteken 
103*bb677cacSAndrew Litteken // The suffix tree cannot currently find repeated substrings in strings of the
104*bb677cacSAndrew Litteken // form {1, 2, 3, 1, 2, 3}, because the two {1, 2, 3}s are adjacent ("tandem
105*bb677cacSAndrew Litteken // repeats")
106*bb677cacSAndrew Litteken //
107*bb677cacSAndrew Litteken // FIXME: Teach the SuffixTree to recognize these cases.
TEST(SuffixTreeTest,TestTandemRepeat)108*bb677cacSAndrew Litteken TEST(SuffixTreeTest, TestTandemRepeat) {
109*bb677cacSAndrew Litteken   std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3};
110*bb677cacSAndrew Litteken   SuffixTree ST(RepeatedRepetitionData);
111*bb677cacSAndrew Litteken   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
112*bb677cacSAndrew Litteken   for (auto It = ST.begin(); It != ST.end(); It++)
113*bb677cacSAndrew Litteken     SubStrings.push_back(*It);
114*bb677cacSAndrew Litteken   EXPECT_EQ(SubStrings.size(), 0u);
115*bb677cacSAndrew Litteken }
116*bb677cacSAndrew Litteken 
117*bb677cacSAndrew Litteken // Tests that the SuffixTree does not erroneously include values that are not
118*bb677cacSAndrew Litteken // in repeated substrings.  That is, only finds {1, 1} at indices 0 and 3 and
119*bb677cacSAndrew Litteken // does not include 2 and 3.
TEST(SuffixTreeTest,TestExclusion)120*bb677cacSAndrew Litteken TEST(SuffixTreeTest, TestExclusion) {
121*bb677cacSAndrew Litteken   std::vector<unsigned> RepeatedRepetitionData = {1, 1, 2, 1, 1, 3};
122*bb677cacSAndrew Litteken   std::vector<unsigned>::iterator RRDIt, RRDIt2;
123*bb677cacSAndrew Litteken   SuffixTree ST(RepeatedRepetitionData);
124*bb677cacSAndrew Litteken   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
125*bb677cacSAndrew Litteken   for (auto It = ST.begin(); It != ST.end(); It++)
126*bb677cacSAndrew Litteken     SubStrings.push_back(*It);
127*bb677cacSAndrew Litteken   EXPECT_EQ(SubStrings.size(), 1u);
128*bb677cacSAndrew Litteken   bool IsExpectedIndex;
129*bb677cacSAndrew Litteken   for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
130*bb677cacSAndrew Litteken     for (unsigned StartIdx : RS.StartIndices) {
131*bb677cacSAndrew Litteken       IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
132*bb677cacSAndrew Litteken       EXPECT_TRUE(IsExpectedIndex);
133*bb677cacSAndrew Litteken       RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
134*bb677cacSAndrew Litteken       std::advance(RRDIt, StartIdx);
135*bb677cacSAndrew Litteken       std::advance(RRDIt2, StartIdx + RS.Length);
136*bb677cacSAndrew Litteken       EXPECT_TRUE(
137*bb677cacSAndrew Litteken           all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
138*bb677cacSAndrew Litteken                  [](unsigned Elt) { return Elt == 1; }));
139*bb677cacSAndrew Litteken     }
140*bb677cacSAndrew Litteken   }
141*bb677cacSAndrew Litteken }
142*bb677cacSAndrew Litteken 
143*bb677cacSAndrew Litteken } // namespace
144