1 //===-- Regex.cpp - Regular Expression matcher implementation -------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a POSIX regular expression matcher. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/Regex.h" 15 #include "llvm/ADT/SmallVector.h" 16 #include "llvm/ADT/StringRef.h" 17 #include "llvm/ADT/Twine.h" 18 #include <string> 19 20 // Important this comes last because it defines "_REGEX_H_". At least on 21 // Darwin, if included before any header that (transitively) includes 22 // xlocale.h, this will cause trouble, because of missing regex-related types. 23 #include "regex_impl.h" 24 25 using namespace llvm; 26 27 Regex::Regex() : preg(nullptr), error(REG_BADPAT) {} 28 29 Regex::Regex(StringRef regex, unsigned Flags) { 30 unsigned flags = 0; 31 preg = new llvm_regex(); 32 preg->re_endp = regex.end(); 33 if (Flags & IgnoreCase) 34 flags |= REG_ICASE; 35 if (Flags & Newline) 36 flags |= REG_NEWLINE; 37 if (!(Flags & BasicRegex)) 38 flags |= REG_EXTENDED; 39 error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); 40 } 41 42 Regex::Regex(Regex &®ex) { 43 preg = regex.preg; 44 error = regex.error; 45 regex.preg = nullptr; 46 regex.error = REG_BADPAT; 47 } 48 49 Regex::~Regex() { 50 if (preg) { 51 llvm_regfree(preg); 52 delete preg; 53 } 54 } 55 56 bool Regex::isValid(std::string &Error) const { 57 if (!error) 58 return true; 59 60 size_t len = llvm_regerror(error, preg, nullptr, 0); 61 62 Error.resize(len - 1); 63 llvm_regerror(error, preg, &Error[0], len); 64 return false; 65 } 66 67 /// getNumMatches - In a valid regex, return the number of parenthesized 68 /// matches it contains. 69 unsigned Regex::getNumMatches() const { 70 return preg->re_nsub; 71 } 72 73 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){ 74 if (error) 75 return false; 76 77 unsigned nmatch = Matches ? preg->re_nsub+1 : 0; 78 79 // pmatch needs to have at least one element. 80 SmallVector<llvm_regmatch_t, 8> pm; 81 pm.resize(nmatch > 0 ? nmatch : 1); 82 pm[0].rm_so = 0; 83 pm[0].rm_eo = String.size(); 84 85 int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); 86 87 if (rc == REG_NOMATCH) 88 return false; 89 if (rc != 0) { 90 // regexec can fail due to invalid pattern or running out of memory. 91 error = rc; 92 return false; 93 } 94 95 // There was a match. 96 97 if (Matches) { // match position requested 98 Matches->clear(); 99 100 for (unsigned i = 0; i != nmatch; ++i) { 101 if (pm[i].rm_so == -1) { 102 // this group didn't match 103 Matches->push_back(StringRef()); 104 continue; 105 } 106 assert(pm[i].rm_eo >= pm[i].rm_so); 107 Matches->push_back(StringRef(String.data()+pm[i].rm_so, 108 pm[i].rm_eo-pm[i].rm_so)); 109 } 110 } 111 112 return true; 113 } 114 115 std::string Regex::sub(StringRef Repl, StringRef String, 116 std::string *Error) { 117 SmallVector<StringRef, 8> Matches; 118 119 // Reset error, if given. 120 if (Error && !Error->empty()) *Error = ""; 121 122 // Return the input if there was no match. 123 if (!match(String, &Matches)) 124 return String; 125 126 // Otherwise splice in the replacement string, starting with the prefix before 127 // the match. 128 std::string Res(String.begin(), Matches[0].begin()); 129 130 // Then the replacement string, honoring possible substitutions. 131 while (!Repl.empty()) { 132 // Skip to the next escape. 133 std::pair<StringRef, StringRef> Split = Repl.split('\\'); 134 135 // Add the skipped substring. 136 Res += Split.first; 137 138 // Check for terminimation and trailing backslash. 139 if (Split.second.empty()) { 140 if (Repl.size() != Split.first.size() && 141 Error && Error->empty()) 142 *Error = "replacement string contained trailing backslash"; 143 break; 144 } 145 146 // Otherwise update the replacement string and interpret escapes. 147 Repl = Split.second; 148 149 // FIXME: We should have a StringExtras function for mapping C99 escapes. 150 switch (Repl[0]) { 151 // Treat all unrecognized characters as self-quoting. 152 default: 153 Res += Repl[0]; 154 Repl = Repl.substr(1); 155 break; 156 157 // Single character escapes. 158 case 't': 159 Res += '\t'; 160 Repl = Repl.substr(1); 161 break; 162 case 'n': 163 Res += '\n'; 164 Repl = Repl.substr(1); 165 break; 166 167 // Decimal escapes are backreferences. 168 case '0': case '1': case '2': case '3': case '4': 169 case '5': case '6': case '7': case '8': case '9': { 170 // Extract the backreference number. 171 StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); 172 Repl = Repl.substr(Ref.size()); 173 174 unsigned RefValue; 175 if (!Ref.getAsInteger(10, RefValue) && 176 RefValue < Matches.size()) 177 Res += Matches[RefValue]; 178 else if (Error && Error->empty()) 179 *Error = ("invalid backreference string '" + Twine(Ref) + "'").str(); 180 break; 181 } 182 } 183 } 184 185 // And finally the suffix. 186 Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); 187 188 return Res; 189 } 190 191 // These are the special characters matched in functions like "p_ere_exp". 192 static const char RegexMetachars[] = "()^$|*+?.[]\\{}"; 193 194 bool Regex::isLiteralERE(StringRef Str) { 195 // Check for regex metacharacters. This list was derived from our regex 196 // implementation in regcomp.c and double checked against the POSIX extended 197 // regular expression specification. 198 return Str.find_first_of(RegexMetachars) == StringRef::npos; 199 } 200 201 std::string Regex::escape(StringRef String) { 202 std::string RegexStr; 203 for (unsigned i = 0, e = String.size(); i != e; ++i) { 204 if (strchr(RegexMetachars, String[i])) 205 RegexStr += '\\'; 206 RegexStr += String[i]; 207 } 208 209 return RegexStr; 210 } 211