1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/ADT/StringRef.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/edit_distance.h" 14 #include <bitset> 15 16 using namespace llvm; 17 18 // MSVC emits references to this into the translation units which reference it. 19 #ifndef _MSC_VER 20 const size_t StringRef::npos; 21 #endif 22 23 static char ascii_tolower(char x) { 24 if (x >= 'A' && x <= 'Z') 25 return x - 'A' + 'a'; 26 return x; 27 } 28 29 static char ascii_toupper(char x) { 30 if (x >= 'a' && x <= 'z') 31 return x - 'a' + 'A'; 32 return x; 33 } 34 35 static bool ascii_isdigit(char x) { 36 return x >= '0' && x <= '9'; 37 } 38 39 // strncasecmp() is not available on non-POSIX systems, so define an 40 // alternative function here. 41 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 42 for (size_t I = 0; I < Length; ++I) { 43 unsigned char LHC = ascii_tolower(LHS[I]); 44 unsigned char RHC = ascii_tolower(RHS[I]); 45 if (LHC != RHC) 46 return LHC < RHC ? -1 : 1; 47 } 48 return 0; 49 } 50 51 /// compare_lower - Compare strings, ignoring case. 52 int StringRef::compare_lower(StringRef RHS) const { 53 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 54 return Res; 55 if (Length == RHS.Length) 56 return 0; 57 return Length < RHS.Length ? -1 : 1; 58 } 59 60 /// Check if this string starts with the given \p Prefix, ignoring case. 61 bool StringRef::startswith_lower(StringRef Prefix) const { 62 return Length >= Prefix.Length && 63 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 64 } 65 66 /// Check if this string ends with the given \p Suffix, ignoring case. 67 bool StringRef::endswith_lower(StringRef Suffix) const { 68 return Length >= Suffix.Length && 69 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 70 } 71 72 /// compare_numeric - Compare strings, handle embedded numbers. 73 int StringRef::compare_numeric(StringRef RHS) const { 74 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 75 // Check for sequences of digits. 76 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 77 // The longer sequence of numbers is considered larger. 78 // This doesn't really handle prefixed zeros well. 79 size_t J; 80 for (J = I + 1; J != E + 1; ++J) { 81 bool ld = J < Length && ascii_isdigit(Data[J]); 82 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 83 if (ld != rd) 84 return rd ? -1 : 1; 85 if (!rd) 86 break; 87 } 88 // The two number sequences have the same length (J-I), just memcmp them. 89 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 90 return Res < 0 ? -1 : 1; 91 // Identical number sequences, continue search after the numbers. 92 I = J - 1; 93 continue; 94 } 95 if (Data[I] != RHS.Data[I]) 96 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 97 } 98 if (Length == RHS.Length) 99 return 0; 100 return Length < RHS.Length ? -1 : 1; 101 } 102 103 // Compute the edit distance between the two given strings. 104 unsigned StringRef::edit_distance(llvm::StringRef Other, 105 bool AllowReplacements, 106 unsigned MaxEditDistance) const { 107 return llvm::ComputeEditDistance( 108 makeArrayRef(data(), size()), 109 makeArrayRef(Other.data(), Other.size()), 110 AllowReplacements, MaxEditDistance); 111 } 112 113 //===----------------------------------------------------------------------===// 114 // String Operations 115 //===----------------------------------------------------------------------===// 116 117 std::string StringRef::lower() const { 118 std::string Result(size(), char()); 119 for (size_type i = 0, e = size(); i != e; ++i) { 120 Result[i] = ascii_tolower(Data[i]); 121 } 122 return Result; 123 } 124 125 std::string StringRef::upper() const { 126 std::string Result(size(), char()); 127 for (size_type i = 0, e = size(); i != e; ++i) { 128 Result[i] = ascii_toupper(Data[i]); 129 } 130 return Result; 131 } 132 133 //===----------------------------------------------------------------------===// 134 // String Searching 135 //===----------------------------------------------------------------------===// 136 137 138 /// find - Search for the first string \arg Str in the string. 139 /// 140 /// \return - The index of the first occurrence of \arg Str, or npos if not 141 /// found. 142 size_t StringRef::find(StringRef Str, size_t From) const { 143 size_t N = Str.size(); 144 if (N > Length) 145 return npos; 146 147 // For short haystacks or unsupported needles fall back to the naive algorithm 148 if (Length < 16 || N > 255 || N == 0) { 149 for (size_t e = Length - N + 1, i = std::min(From, e); i != e; ++i) 150 if (substr(i, N).equals(Str)) 151 return i; 152 return npos; 153 } 154 155 if (From >= Length) 156 return npos; 157 158 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 159 uint8_t BadCharSkip[256]; 160 std::memset(BadCharSkip, N, 256); 161 for (unsigned i = 0; i != N-1; ++i) 162 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 163 164 unsigned Len = Length-From, Pos = From; 165 while (Len >= N) { 166 if (substr(Pos, N).equals(Str)) // See if this is the correct substring. 167 return Pos; 168 169 // Otherwise skip the appropriate number of bytes. 170 uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]]; 171 Len -= Skip; 172 Pos += Skip; 173 } 174 175 return npos; 176 } 177 178 /// rfind - Search for the last string \arg Str in the string. 179 /// 180 /// \return - The index of the last occurrence of \arg Str, or npos if not 181 /// found. 182 size_t StringRef::rfind(StringRef Str) const { 183 size_t N = Str.size(); 184 if (N > Length) 185 return npos; 186 for (size_t i = Length - N + 1, e = 0; i != e;) { 187 --i; 188 if (substr(i, N).equals(Str)) 189 return i; 190 } 191 return npos; 192 } 193 194 /// find_first_of - Find the first character in the string that is in \arg 195 /// Chars, or npos if not found. 196 /// 197 /// Note: O(size() + Chars.size()) 198 StringRef::size_type StringRef::find_first_of(StringRef Chars, 199 size_t From) const { 200 std::bitset<1 << CHAR_BIT> CharBits; 201 for (size_type i = 0; i != Chars.size(); ++i) 202 CharBits.set((unsigned char)Chars[i]); 203 204 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 205 if (CharBits.test((unsigned char)Data[i])) 206 return i; 207 return npos; 208 } 209 210 /// find_first_not_of - Find the first character in the string that is not 211 /// \arg C or npos if not found. 212 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 213 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 214 if (Data[i] != C) 215 return i; 216 return npos; 217 } 218 219 /// find_first_not_of - Find the first character in the string that is not 220 /// in the string \arg Chars, or npos if not found. 221 /// 222 /// Note: O(size() + Chars.size()) 223 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 224 size_t From) const { 225 std::bitset<1 << CHAR_BIT> CharBits; 226 for (size_type i = 0; i != Chars.size(); ++i) 227 CharBits.set((unsigned char)Chars[i]); 228 229 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 230 if (!CharBits.test((unsigned char)Data[i])) 231 return i; 232 return npos; 233 } 234 235 /// find_last_of - Find the last character in the string that is in \arg C, 236 /// or npos if not found. 237 /// 238 /// Note: O(size() + Chars.size()) 239 StringRef::size_type StringRef::find_last_of(StringRef Chars, 240 size_t From) const { 241 std::bitset<1 << CHAR_BIT> CharBits; 242 for (size_type i = 0; i != Chars.size(); ++i) 243 CharBits.set((unsigned char)Chars[i]); 244 245 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 246 if (CharBits.test((unsigned char)Data[i])) 247 return i; 248 return npos; 249 } 250 251 /// find_last_not_of - Find the last character in the string that is not 252 /// \arg C, or npos if not found. 253 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 254 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 255 if (Data[i] != C) 256 return i; 257 return npos; 258 } 259 260 /// find_last_not_of - Find the last character in the string that is not in 261 /// \arg Chars, or npos if not found. 262 /// 263 /// Note: O(size() + Chars.size()) 264 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 265 size_t From) const { 266 std::bitset<1 << CHAR_BIT> CharBits; 267 for (size_type i = 0, e = Chars.size(); i != e; ++i) 268 CharBits.set((unsigned char)Chars[i]); 269 270 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 271 if (!CharBits.test((unsigned char)Data[i])) 272 return i; 273 return npos; 274 } 275 276 void StringRef::split(SmallVectorImpl<StringRef> &A, 277 StringRef Separators, int MaxSplit, 278 bool KeepEmpty) const { 279 StringRef rest = *this; 280 281 // rest.data() is used to distinguish cases like "a," that splits into 282 // "a" + "" and "a" that splits into "a" + 0. 283 for (int splits = 0; 284 rest.data() != nullptr && (MaxSplit < 0 || splits < MaxSplit); 285 ++splits) { 286 std::pair<StringRef, StringRef> p = rest.split(Separators); 287 288 if (KeepEmpty || p.first.size() != 0) 289 A.push_back(p.first); 290 rest = p.second; 291 } 292 // If we have a tail left, add it. 293 if (rest.data() != nullptr && (rest.size() != 0 || KeepEmpty)) 294 A.push_back(rest); 295 } 296 297 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 298 int MaxSplit, bool KeepEmpty) const { 299 StringRef rest = *this; 300 301 // rest.data() is used to distinguish cases like "a," that splits into 302 // "a" + "" and "a" that splits into "a" + 0. 303 for (int splits = 0; 304 rest.data() != nullptr && (MaxSplit < 0 || splits < MaxSplit); 305 ++splits) { 306 std::pair<StringRef, StringRef> p = rest.split(Separator); 307 308 if (KeepEmpty || p.first.size() != 0) 309 A.push_back(p.first); 310 rest = p.second; 311 } 312 // If we have a tail left, add it. 313 if (rest.data() != nullptr && (rest.size() != 0 || KeepEmpty)) 314 A.push_back(rest); 315 } 316 317 //===----------------------------------------------------------------------===// 318 // Helpful Algorithms 319 //===----------------------------------------------------------------------===// 320 321 /// count - Return the number of non-overlapped occurrences of \arg Str in 322 /// the string. 323 size_t StringRef::count(StringRef Str) const { 324 size_t Count = 0; 325 size_t N = Str.size(); 326 if (N > Length) 327 return 0; 328 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 329 if (substr(i, N).equals(Str)) 330 ++Count; 331 return Count; 332 } 333 334 static unsigned GetAutoSenseRadix(StringRef &Str) { 335 if (Str.startswith("0x")) { 336 Str = Str.substr(2); 337 return 16; 338 } 339 340 if (Str.startswith("0b")) { 341 Str = Str.substr(2); 342 return 2; 343 } 344 345 if (Str.startswith("0o")) { 346 Str = Str.substr(2); 347 return 8; 348 } 349 350 if (Str.startswith("0")) 351 return 8; 352 353 return 10; 354 } 355 356 357 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 358 /// sequence of radix up to 36 to an unsigned long long value. 359 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 360 unsigned long long &Result) { 361 // Autosense radix if not specified. 362 if (Radix == 0) 363 Radix = GetAutoSenseRadix(Str); 364 365 // Empty strings (after the radix autosense) are invalid. 366 if (Str.empty()) return true; 367 368 // Parse all the bytes of the string given this radix. Watch for overflow. 369 Result = 0; 370 while (!Str.empty()) { 371 unsigned CharVal; 372 if (Str[0] >= '0' && Str[0] <= '9') 373 CharVal = Str[0]-'0'; 374 else if (Str[0] >= 'a' && Str[0] <= 'z') 375 CharVal = Str[0]-'a'+10; 376 else if (Str[0] >= 'A' && Str[0] <= 'Z') 377 CharVal = Str[0]-'A'+10; 378 else 379 return true; 380 381 // If the parsed value is larger than the integer radix, the string is 382 // invalid. 383 if (CharVal >= Radix) 384 return true; 385 386 // Add in this character. 387 unsigned long long PrevResult = Result; 388 Result = Result*Radix+CharVal; 389 390 // Check for overflow by shifting back and seeing if bits were lost. 391 if (Result/Radix < PrevResult) 392 return true; 393 394 Str = Str.substr(1); 395 } 396 397 return false; 398 } 399 400 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 401 long long &Result) { 402 unsigned long long ULLVal; 403 404 // Handle positive strings first. 405 if (Str.empty() || Str.front() != '-') { 406 if (getAsUnsignedInteger(Str, Radix, ULLVal) || 407 // Check for value so large it overflows a signed value. 408 (long long)ULLVal < 0) 409 return true; 410 Result = ULLVal; 411 return false; 412 } 413 414 // Get the positive part of the value. 415 if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) || 416 // Reject values so large they'd overflow as negative signed, but allow 417 // "-0". This negates the unsigned so that the negative isn't undefined 418 // on signed overflow. 419 (long long)-ULLVal > 0) 420 return true; 421 422 Result = -ULLVal; 423 return false; 424 } 425 426 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 427 StringRef Str = *this; 428 429 // Autosense radix if not specified. 430 if (Radix == 0) 431 Radix = GetAutoSenseRadix(Str); 432 433 assert(Radix > 1 && Radix <= 36); 434 435 // Empty strings (after the radix autosense) are invalid. 436 if (Str.empty()) return true; 437 438 // Skip leading zeroes. This can be a significant improvement if 439 // it means we don't need > 64 bits. 440 while (!Str.empty() && Str.front() == '0') 441 Str = Str.substr(1); 442 443 // If it was nothing but zeroes.... 444 if (Str.empty()) { 445 Result = APInt(64, 0); 446 return false; 447 } 448 449 // (Over-)estimate the required number of bits. 450 unsigned Log2Radix = 0; 451 while ((1U << Log2Radix) < Radix) Log2Radix++; 452 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 453 454 unsigned BitWidth = Log2Radix * Str.size(); 455 if (BitWidth < Result.getBitWidth()) 456 BitWidth = Result.getBitWidth(); // don't shrink the result 457 else if (BitWidth > Result.getBitWidth()) 458 Result = Result.zext(BitWidth); 459 460 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 461 if (!IsPowerOf2Radix) { 462 // These must have the same bit-width as Result. 463 RadixAP = APInt(BitWidth, Radix); 464 CharAP = APInt(BitWidth, 0); 465 } 466 467 // Parse all the bytes of the string given this radix. 468 Result = 0; 469 while (!Str.empty()) { 470 unsigned CharVal; 471 if (Str[0] >= '0' && Str[0] <= '9') 472 CharVal = Str[0]-'0'; 473 else if (Str[0] >= 'a' && Str[0] <= 'z') 474 CharVal = Str[0]-'a'+10; 475 else if (Str[0] >= 'A' && Str[0] <= 'Z') 476 CharVal = Str[0]-'A'+10; 477 else 478 return true; 479 480 // If the parsed value is larger than the integer radix, the string is 481 // invalid. 482 if (CharVal >= Radix) 483 return true; 484 485 // Add in this character. 486 if (IsPowerOf2Radix) { 487 Result <<= Log2Radix; 488 Result |= CharVal; 489 } else { 490 Result *= RadixAP; 491 CharAP = CharVal; 492 Result += CharAP; 493 } 494 495 Str = Str.substr(1); 496 } 497 498 return false; 499 } 500 501 502 // Implementation of StringRef hashing. 503 hash_code llvm::hash_value(StringRef S) { 504 return hash_combine_range(S.begin(), S.end()); 505 } 506