1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/ADT/StringRef.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/edit_distance.h" 14 #include <bitset> 15 16 using namespace llvm; 17 18 // MSVC emits references to this into the translation units which reference it. 19 #ifndef _MSC_VER 20 const size_t StringRef::npos; 21 #endif 22 23 static char ascii_tolower(char x) { 24 if (x >= 'A' && x <= 'Z') 25 return x - 'A' + 'a'; 26 return x; 27 } 28 29 static char ascii_toupper(char x) { 30 if (x >= 'a' && x <= 'z') 31 return x - 'a' + 'A'; 32 return x; 33 } 34 35 static bool ascii_isdigit(char x) { 36 return x >= '0' && x <= '9'; 37 } 38 39 // strncasecmp() is not available on non-POSIX systems, so define an 40 // alternative function here. 41 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 42 for (size_t I = 0; I < Length; ++I) { 43 unsigned char LHC = ascii_tolower(LHS[I]); 44 unsigned char RHC = ascii_tolower(RHS[I]); 45 if (LHC != RHC) 46 return LHC < RHC ? -1 : 1; 47 } 48 return 0; 49 } 50 51 /// compare_lower - Compare strings, ignoring case. 52 int StringRef::compare_lower(StringRef RHS) const { 53 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 54 return Res; 55 if (Length == RHS.Length) 56 return 0; 57 return Length < RHS.Length ? -1 : 1; 58 } 59 60 /// Check if this string starts with the given \p Prefix, ignoring case. 61 bool StringRef::startswith_lower(StringRef Prefix) const { 62 return Length >= Prefix.Length && 63 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 64 } 65 66 /// Check if this string ends with the given \p Suffix, ignoring case. 67 bool StringRef::endswith_lower(StringRef Suffix) const { 68 return Length >= Suffix.Length && 69 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 70 } 71 72 /// compare_numeric - Compare strings, handle embedded numbers. 73 int StringRef::compare_numeric(StringRef RHS) const { 74 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 75 // Check for sequences of digits. 76 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 77 // The longer sequence of numbers is considered larger. 78 // This doesn't really handle prefixed zeros well. 79 size_t J; 80 for (J = I + 1; J != E + 1; ++J) { 81 bool ld = J < Length && ascii_isdigit(Data[J]); 82 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 83 if (ld != rd) 84 return rd ? -1 : 1; 85 if (!rd) 86 break; 87 } 88 // The two number sequences have the same length (J-I), just memcmp them. 89 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 90 return Res < 0 ? -1 : 1; 91 // Identical number sequences, continue search after the numbers. 92 I = J - 1; 93 continue; 94 } 95 if (Data[I] != RHS.Data[I]) 96 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 97 } 98 if (Length == RHS.Length) 99 return 0; 100 return Length < RHS.Length ? -1 : 1; 101 } 102 103 // Compute the edit distance between the two given strings. 104 unsigned StringRef::edit_distance(llvm::StringRef Other, 105 bool AllowReplacements, 106 unsigned MaxEditDistance) const { 107 return llvm::ComputeEditDistance( 108 makeArrayRef(data(), size()), 109 makeArrayRef(Other.data(), Other.size()), 110 AllowReplacements, MaxEditDistance); 111 } 112 113 //===----------------------------------------------------------------------===// 114 // String Operations 115 //===----------------------------------------------------------------------===// 116 117 std::string StringRef::lower() const { 118 std::string Result(size(), char()); 119 for (size_type i = 0, e = size(); i != e; ++i) { 120 Result[i] = ascii_tolower(Data[i]); 121 } 122 return Result; 123 } 124 125 std::string StringRef::upper() const { 126 std::string Result(size(), char()); 127 for (size_type i = 0, e = size(); i != e; ++i) { 128 Result[i] = ascii_toupper(Data[i]); 129 } 130 return Result; 131 } 132 133 //===----------------------------------------------------------------------===// 134 // String Searching 135 //===----------------------------------------------------------------------===// 136 137 138 /// find - Search for the first string \arg Str in the string. 139 /// 140 /// \return - The index of the first occurrence of \arg Str, or npos if not 141 /// found. 142 size_t StringRef::find(StringRef Str, size_t From) const { 143 if (From > Length) 144 return npos; 145 146 const char *Needle = Str.data(); 147 size_t N = Str.size(); 148 if (N == 0) 149 return From; 150 151 size_t Size = Length - From; 152 if (Size < N) 153 return npos; 154 155 const char *Start = Data + From; 156 const char *Stop = Start + (Size - N + 1); 157 158 // For short haystacks or unsupported needles fall back to the naive algorithm 159 if (Size < 16 || N > 255) { 160 do { 161 if (std::memcmp(Start, Needle, N) == 0) 162 return Start - Data; 163 ++Start; 164 } while (Start < Stop); 165 return npos; 166 } 167 168 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 169 uint8_t BadCharSkip[256]; 170 std::memset(BadCharSkip, N, 256); 171 for (unsigned i = 0; i != N-1; ++i) 172 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 173 174 do { 175 if (std::memcmp(Start, Needle, N) == 0) 176 return Start - Data; 177 178 // Otherwise skip the appropriate number of bytes. 179 Start += BadCharSkip[(uint8_t)Start[N-1]]; 180 } while (Start < Stop); 181 182 return npos; 183 } 184 185 /// rfind - Search for the last string \arg Str in the string. 186 /// 187 /// \return - The index of the last occurrence of \arg Str, or npos if not 188 /// found. 189 size_t StringRef::rfind(StringRef Str) const { 190 size_t N = Str.size(); 191 if (N > Length) 192 return npos; 193 for (size_t i = Length - N + 1, e = 0; i != e;) { 194 --i; 195 if (substr(i, N).equals(Str)) 196 return i; 197 } 198 return npos; 199 } 200 201 /// find_first_of - Find the first character in the string that is in \arg 202 /// Chars, or npos if not found. 203 /// 204 /// Note: O(size() + Chars.size()) 205 StringRef::size_type StringRef::find_first_of(StringRef Chars, 206 size_t From) const { 207 std::bitset<1 << CHAR_BIT> CharBits; 208 for (size_type i = 0; i != Chars.size(); ++i) 209 CharBits.set((unsigned char)Chars[i]); 210 211 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 212 if (CharBits.test((unsigned char)Data[i])) 213 return i; 214 return npos; 215 } 216 217 /// find_first_not_of - Find the first character in the string that is not 218 /// \arg C or npos if not found. 219 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 220 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 221 if (Data[i] != C) 222 return i; 223 return npos; 224 } 225 226 /// find_first_not_of - Find the first character in the string that is not 227 /// in the string \arg Chars, or npos if not found. 228 /// 229 /// Note: O(size() + Chars.size()) 230 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 231 size_t From) const { 232 std::bitset<1 << CHAR_BIT> CharBits; 233 for (size_type i = 0; i != Chars.size(); ++i) 234 CharBits.set((unsigned char)Chars[i]); 235 236 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 237 if (!CharBits.test((unsigned char)Data[i])) 238 return i; 239 return npos; 240 } 241 242 /// find_last_of - Find the last character in the string that is in \arg C, 243 /// or npos if not found. 244 /// 245 /// Note: O(size() + Chars.size()) 246 StringRef::size_type StringRef::find_last_of(StringRef Chars, 247 size_t From) const { 248 std::bitset<1 << CHAR_BIT> CharBits; 249 for (size_type i = 0; i != Chars.size(); ++i) 250 CharBits.set((unsigned char)Chars[i]); 251 252 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 253 if (CharBits.test((unsigned char)Data[i])) 254 return i; 255 return npos; 256 } 257 258 /// find_last_not_of - Find the last character in the string that is not 259 /// \arg C, or npos if not found. 260 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 261 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 262 if (Data[i] != C) 263 return i; 264 return npos; 265 } 266 267 /// find_last_not_of - Find the last character in the string that is not in 268 /// \arg Chars, or npos if not found. 269 /// 270 /// Note: O(size() + Chars.size()) 271 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 272 size_t From) const { 273 std::bitset<1 << CHAR_BIT> CharBits; 274 for (size_type i = 0, e = Chars.size(); i != e; ++i) 275 CharBits.set((unsigned char)Chars[i]); 276 277 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 278 if (!CharBits.test((unsigned char)Data[i])) 279 return i; 280 return npos; 281 } 282 283 void StringRef::split(SmallVectorImpl<StringRef> &A, 284 StringRef Separator, int MaxSplit, 285 bool KeepEmpty) const { 286 StringRef S = *this; 287 288 // Count down from MaxSplit. When MaxSplit is -1, this will just split 289 // "forever". This doesn't support splitting more than 2^31 times 290 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 291 // but that seems unlikely to be useful. 292 while (MaxSplit-- != 0) { 293 size_t Idx = S.find(Separator); 294 if (Idx == npos) 295 break; 296 297 // Push this split. 298 if (KeepEmpty || Idx > 0) 299 A.push_back(S.slice(0, Idx)); 300 301 // Jump forward. 302 S = S.slice(Idx + Separator.size(), npos); 303 } 304 305 // Push the tail. 306 if (KeepEmpty || !S.empty()) 307 A.push_back(S); 308 } 309 310 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 311 int MaxSplit, bool KeepEmpty) const { 312 StringRef S = *this; 313 314 // Count down from MaxSplit. When MaxSplit is -1, this will just split 315 // "forever". This doesn't support splitting more than 2^31 times 316 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 317 // but that seems unlikely to be useful. 318 while (MaxSplit-- != 0) { 319 size_t Idx = S.find(Separator); 320 if (Idx == npos) 321 break; 322 323 // Push this split. 324 if (KeepEmpty || Idx > 0) 325 A.push_back(S.slice(0, Idx)); 326 327 // Jump forward. 328 S = S.slice(Idx + 1, npos); 329 } 330 331 // Push the tail. 332 if (KeepEmpty || !S.empty()) 333 A.push_back(S); 334 } 335 336 //===----------------------------------------------------------------------===// 337 // Helpful Algorithms 338 //===----------------------------------------------------------------------===// 339 340 /// count - Return the number of non-overlapped occurrences of \arg Str in 341 /// the string. 342 size_t StringRef::count(StringRef Str) const { 343 size_t Count = 0; 344 size_t N = Str.size(); 345 if (N > Length) 346 return 0; 347 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 348 if (substr(i, N).equals(Str)) 349 ++Count; 350 return Count; 351 } 352 353 static unsigned GetAutoSenseRadix(StringRef &Str) { 354 if (Str.startswith("0x") || Str.startswith("0X")) { 355 Str = Str.substr(2); 356 return 16; 357 } 358 359 if (Str.startswith("0b") || Str.startswith("0B")) { 360 Str = Str.substr(2); 361 return 2; 362 } 363 364 if (Str.startswith("0o")) { 365 Str = Str.substr(2); 366 return 8; 367 } 368 369 if (Str[0] == '0' && Str.size() > 1 && ascii_isdigit(Str[1])) { 370 Str = Str.substr(1); 371 return 8; 372 } 373 374 return 10; 375 } 376 377 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, 378 unsigned long long &Result) { 379 // Autosense radix if not specified. 380 if (Radix == 0) 381 Radix = GetAutoSenseRadix(Str); 382 383 // Empty strings (after the radix autosense) are invalid. 384 if (Str.empty()) return true; 385 386 // Parse all the bytes of the string given this radix. Watch for overflow. 387 StringRef Str2 = Str; 388 Result = 0; 389 while (!Str2.empty()) { 390 unsigned CharVal; 391 if (Str2[0] >= '0' && Str2[0] <= '9') 392 CharVal = Str2[0] - '0'; 393 else if (Str2[0] >= 'a' && Str2[0] <= 'z') 394 CharVal = Str2[0] - 'a' + 10; 395 else if (Str2[0] >= 'A' && Str2[0] <= 'Z') 396 CharVal = Str2[0] - 'A' + 10; 397 else 398 break; 399 400 // If the parsed value is larger than the integer radix, we cannot 401 // consume any more characters. 402 if (CharVal >= Radix) 403 break; 404 405 // Add in this character. 406 unsigned long long PrevResult = Result; 407 Result = Result * Radix + CharVal; 408 409 // Check for overflow by shifting back and seeing if bits were lost. 410 if (Result / Radix < PrevResult) 411 return true; 412 413 Str2 = Str2.substr(1); 414 } 415 416 // We consider the operation a failure if no characters were consumed 417 // successfully. 418 if (Str.size() == Str2.size()) 419 return true; 420 421 Str = Str2; 422 return false; 423 } 424 425 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix, 426 long long &Result) { 427 unsigned long long ULLVal; 428 429 // Handle positive strings first. 430 if (Str.empty() || Str.front() != '-') { 431 if (consumeUnsignedInteger(Str, Radix, ULLVal) || 432 // Check for value so large it overflows a signed value. 433 (long long)ULLVal < 0) 434 return true; 435 Result = ULLVal; 436 return false; 437 } 438 439 // Get the positive part of the value. 440 StringRef Str2 = Str.drop_front(1); 441 if (consumeUnsignedInteger(Str2, Radix, ULLVal) || 442 // Reject values so large they'd overflow as negative signed, but allow 443 // "-0". This negates the unsigned so that the negative isn't undefined 444 // on signed overflow. 445 (long long)-ULLVal > 0) 446 return true; 447 448 Str = Str2; 449 Result = -ULLVal; 450 return false; 451 } 452 453 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 454 /// sequence of radix up to 36 to an unsigned long long value. 455 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 456 unsigned long long &Result) { 457 if (consumeUnsignedInteger(Str, Radix, Result)) 458 return true; 459 460 // For getAsUnsignedInteger, we require the whole string to be consumed or 461 // else we consider it a failure. 462 return !Str.empty(); 463 } 464 465 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 466 long long &Result) { 467 if (consumeSignedInteger(Str, Radix, Result)) 468 return true; 469 470 // For getAsSignedInteger, we require the whole string to be consumed or else 471 // we consider it a failure. 472 return !Str.empty(); 473 } 474 475 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 476 StringRef Str = *this; 477 478 // Autosense radix if not specified. 479 if (Radix == 0) 480 Radix = GetAutoSenseRadix(Str); 481 482 assert(Radix > 1 && Radix <= 36); 483 484 // Empty strings (after the radix autosense) are invalid. 485 if (Str.empty()) return true; 486 487 // Skip leading zeroes. This can be a significant improvement if 488 // it means we don't need > 64 bits. 489 while (!Str.empty() && Str.front() == '0') 490 Str = Str.substr(1); 491 492 // If it was nothing but zeroes.... 493 if (Str.empty()) { 494 Result = APInt(64, 0); 495 return false; 496 } 497 498 // (Over-)estimate the required number of bits. 499 unsigned Log2Radix = 0; 500 while ((1U << Log2Radix) < Radix) Log2Radix++; 501 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 502 503 unsigned BitWidth = Log2Radix * Str.size(); 504 if (BitWidth < Result.getBitWidth()) 505 BitWidth = Result.getBitWidth(); // don't shrink the result 506 else if (BitWidth > Result.getBitWidth()) 507 Result = Result.zext(BitWidth); 508 509 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 510 if (!IsPowerOf2Radix) { 511 // These must have the same bit-width as Result. 512 RadixAP = APInt(BitWidth, Radix); 513 CharAP = APInt(BitWidth, 0); 514 } 515 516 // Parse all the bytes of the string given this radix. 517 Result = 0; 518 while (!Str.empty()) { 519 unsigned CharVal; 520 if (Str[0] >= '0' && Str[0] <= '9') 521 CharVal = Str[0]-'0'; 522 else if (Str[0] >= 'a' && Str[0] <= 'z') 523 CharVal = Str[0]-'a'+10; 524 else if (Str[0] >= 'A' && Str[0] <= 'Z') 525 CharVal = Str[0]-'A'+10; 526 else 527 return true; 528 529 // If the parsed value is larger than the integer radix, the string is 530 // invalid. 531 if (CharVal >= Radix) 532 return true; 533 534 // Add in this character. 535 if (IsPowerOf2Radix) { 536 Result <<= Log2Radix; 537 Result |= CharVal; 538 } else { 539 Result *= RadixAP; 540 CharAP = CharVal; 541 Result += CharAP; 542 } 543 544 Str = Str.substr(1); 545 } 546 547 return false; 548 } 549 550 551 // Implementation of StringRef hashing. 552 hash_code llvm::hash_value(StringRef S) { 553 return hash_combine_range(S.begin(), S.end()); 554 } 555