1 //===- unittests/Support/SuffixTreeTest.cpp - suffix tree tests -----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/Support/SuffixTree.h" 10 #include "gtest/gtest.h" 11 #include <vector> 12 13 using namespace llvm; 14 15 namespace { 16 17 // Each example vector has a unique element at the end to represent the end of 18 // the string 19 20 // Tests that The SuffixTree finds a simple repetition of the substring {1, 2} 21 // {1, 2} twice in the provided string. 22 TEST(SuffixTreeTest, TestSingleRepetition) { 23 std::vector<unsigned> SimpleRepetitionData = {1, 2, 1, 2, 3}; 24 SuffixTree ST(SimpleRepetitionData); 25 std::vector<SuffixTree::RepeatedSubstring> SubStrings; 26 for (auto It = ST.begin(); It != ST.end(); It++) 27 SubStrings.push_back(*It); 28 ASSERT_EQ(SubStrings.size(), 1u); 29 EXPECT_EQ(SubStrings[0].Length, 2u); 30 EXPECT_EQ(SubStrings[0].StartIndices.size(), 2u); 31 for (unsigned StartIdx : SubStrings[0].StartIndices) { 32 EXPECT_EQ(SimpleRepetitionData[StartIdx], 1u); 33 EXPECT_EQ(SimpleRepetitionData[StartIdx + 1], 2u); 34 } 35 } 36 37 // Tests that the SuffixTree is able to find the substrings {1, 2, 3} at 38 // at indices 0 and 3 as well as the substrings {2, 3} at indices 1 and 4. 39 // This test also serves as a flag for improvements to the suffix tree. 40 // 41 // FIXME: Right now, the longest repeated substring from a specific index is 42 // returned, this could be improved to return the longest repeated substring, as 43 // well as those that are smaller. 44 TEST(SuffixTreeTest, TestLongerRepetition) { 45 std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3, 4}; 46 SuffixTree ST(RepeatedRepetitionData); 47 std::vector<SuffixTree::RepeatedSubstring> SubStrings; 48 for (auto It = ST.begin(); It != ST.end(); It++) 49 SubStrings.push_back(*It); 50 EXPECT_EQ(SubStrings.size(), 2u); 51 unsigned Len; 52 for (SuffixTree::RepeatedSubstring &RS : SubStrings) { 53 Len = RS.Length; 54 bool IsExpectedLen = (Len == 3u || Len == 2u); 55 bool IsExpectedIndex; 56 ASSERT_TRUE(IsExpectedLen); 57 58 if (Len == 3u) { 59 for (unsigned StartIdx : RS.StartIndices) { 60 IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u); 61 EXPECT_TRUE(IsExpectedIndex); 62 EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u); 63 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u); 64 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u); 65 } 66 } else { 67 for (unsigned StartIdx : RS.StartIndices) { 68 IsExpectedIndex = (StartIdx == 1u || StartIdx == 4u); 69 EXPECT_TRUE(IsExpectedIndex); 70 EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u); 71 EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u); 72 } 73 } 74 } 75 } 76 77 // Tests that the SuffixTree is able to find substring {1, 1, 1, 1, 1} at 78 // indices 0 and 1. 79 // 80 // FIXME: Add support for detecting {1, 1} and {1, 1, 1} 81 TEST(SuffixTreeTest, TestSingleCharacterRepeat) { 82 std::vector<unsigned> RepeatedRepetitionData = {1, 1, 1, 1, 1, 1, 2}; 83 std::vector<unsigned>::iterator RRDIt, RRDIt2; 84 SuffixTree ST(RepeatedRepetitionData); 85 std::vector<SuffixTree::RepeatedSubstring> SubStrings; 86 for (auto It = ST.begin(); It != ST.end(); It++) 87 SubStrings.push_back(*It); 88 EXPECT_EQ(SubStrings.size(), 1u); 89 for (SuffixTree::RepeatedSubstring &RS : SubStrings) { 90 EXPECT_EQ(RS.StartIndices.size(), 91 RepeatedRepetitionData.size() - RS.Length); 92 for (unsigned StartIdx : SubStrings[0].StartIndices) { 93 RRDIt = RRDIt2 = RepeatedRepetitionData.begin(); 94 std::advance(RRDIt, StartIdx); 95 std::advance(RRDIt2, StartIdx + SubStrings[0].Length); 96 ASSERT_TRUE( 97 all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2), 98 [](unsigned Elt) { return Elt == 1; })); 99 } 100 } 101 } 102 103 // The suffix tree cannot currently find repeated substrings in strings of the 104 // form {1, 2, 3, 1, 2, 3}, because the two {1, 2, 3}s are adjacent ("tandem 105 // repeats") 106 // 107 // FIXME: Teach the SuffixTree to recognize these cases. 108 TEST(SuffixTreeTest, TestTandemRepeat) { 109 std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3}; 110 SuffixTree ST(RepeatedRepetitionData); 111 std::vector<SuffixTree::RepeatedSubstring> SubStrings; 112 for (auto It = ST.begin(); It != ST.end(); It++) 113 SubStrings.push_back(*It); 114 EXPECT_EQ(SubStrings.size(), 0u); 115 } 116 117 // Tests that the SuffixTree does not erroneously include values that are not 118 // in repeated substrings. That is, only finds {1, 1} at indices 0 and 3 and 119 // does not include 2 and 3. 120 TEST(SuffixTreeTest, TestExclusion) { 121 std::vector<unsigned> RepeatedRepetitionData = {1, 1, 2, 1, 1, 3}; 122 std::vector<unsigned>::iterator RRDIt, RRDIt2; 123 SuffixTree ST(RepeatedRepetitionData); 124 std::vector<SuffixTree::RepeatedSubstring> SubStrings; 125 for (auto It = ST.begin(); It != ST.end(); It++) 126 SubStrings.push_back(*It); 127 EXPECT_EQ(SubStrings.size(), 1u); 128 bool IsExpectedIndex; 129 for (SuffixTree::RepeatedSubstring &RS : SubStrings) { 130 for (unsigned StartIdx : RS.StartIndices) { 131 IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u); 132 EXPECT_TRUE(IsExpectedIndex); 133 RRDIt = RRDIt2 = RepeatedRepetitionData.begin(); 134 std::advance(RRDIt, StartIdx); 135 std::advance(RRDIt2, StartIdx + RS.Length); 136 EXPECT_TRUE( 137 all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2), 138 [](unsigned Elt) { return Elt == 1; })); 139 } 140 } 141 } 142 143 } // namespace 144