1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/ADT/StringRef.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/edit_distance.h" 14 #include <bitset> 15 16 using namespace llvm; 17 18 // MSVC emits references to this into the translation units which reference it. 19 #ifndef _MSC_VER 20 const size_t StringRef::npos; 21 #endif 22 23 static char ascii_tolower(char x) { 24 if (x >= 'A' && x <= 'Z') 25 return x - 'A' + 'a'; 26 return x; 27 } 28 29 static char ascii_toupper(char x) { 30 if (x >= 'a' && x <= 'z') 31 return x - 'a' + 'A'; 32 return x; 33 } 34 35 static bool ascii_isdigit(char x) { 36 return x >= '0' && x <= '9'; 37 } 38 39 // strncasecmp() is not available on non-POSIX systems, so define an 40 // alternative function here. 41 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 42 for (size_t I = 0; I < Length; ++I) { 43 unsigned char LHC = ascii_tolower(LHS[I]); 44 unsigned char RHC = ascii_tolower(RHS[I]); 45 if (LHC != RHC) 46 return LHC < RHC ? -1 : 1; 47 } 48 return 0; 49 } 50 51 /// compare_lower - Compare strings, ignoring case. 52 int StringRef::compare_lower(StringRef RHS) const { 53 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 54 return Res; 55 if (Length == RHS.Length) 56 return 0; 57 return Length < RHS.Length ? -1 : 1; 58 } 59 60 /// Check if this string starts with the given \p Prefix, ignoring case. 61 bool StringRef::startswith_lower(StringRef Prefix) const { 62 return Length >= Prefix.Length && 63 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 64 } 65 66 /// Check if this string ends with the given \p Suffix, ignoring case. 67 bool StringRef::endswith_lower(StringRef Suffix) const { 68 return Length >= Suffix.Length && 69 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 70 } 71 72 /// compare_numeric - Compare strings, handle embedded numbers. 73 int StringRef::compare_numeric(StringRef RHS) const { 74 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 75 // Check for sequences of digits. 76 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 77 // The longer sequence of numbers is considered larger. 78 // This doesn't really handle prefixed zeros well. 79 size_t J; 80 for (J = I + 1; J != E + 1; ++J) { 81 bool ld = J < Length && ascii_isdigit(Data[J]); 82 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 83 if (ld != rd) 84 return rd ? -1 : 1; 85 if (!rd) 86 break; 87 } 88 // The two number sequences have the same length (J-I), just memcmp them. 89 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 90 return Res < 0 ? -1 : 1; 91 // Identical number sequences, continue search after the numbers. 92 I = J - 1; 93 continue; 94 } 95 if (Data[I] != RHS.Data[I]) 96 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 97 } 98 if (Length == RHS.Length) 99 return 0; 100 return Length < RHS.Length ? -1 : 1; 101 } 102 103 // Compute the edit distance between the two given strings. 104 unsigned StringRef::edit_distance(llvm::StringRef Other, 105 bool AllowReplacements, 106 unsigned MaxEditDistance) const { 107 return llvm::ComputeEditDistance( 108 makeArrayRef(data(), size()), 109 makeArrayRef(Other.data(), Other.size()), 110 AllowReplacements, MaxEditDistance); 111 } 112 113 //===----------------------------------------------------------------------===// 114 // String Operations 115 //===----------------------------------------------------------------------===// 116 117 std::string StringRef::lower() const { 118 std::string Result(size(), char()); 119 for (size_type i = 0, e = size(); i != e; ++i) { 120 Result[i] = ascii_tolower(Data[i]); 121 } 122 return Result; 123 } 124 125 std::string StringRef::upper() const { 126 std::string Result(size(), char()); 127 for (size_type i = 0, e = size(); i != e; ++i) { 128 Result[i] = ascii_toupper(Data[i]); 129 } 130 return Result; 131 } 132 133 //===----------------------------------------------------------------------===// 134 // String Searching 135 //===----------------------------------------------------------------------===// 136 137 138 /// find - Search for the first string \arg Str in the string. 139 /// 140 /// \return - The index of the first occurrence of \arg Str, or npos if not 141 /// found. 142 size_t StringRef::find(StringRef Str, size_t From) const { 143 size_t N = Str.size(); 144 if (N > Length) 145 return npos; 146 147 // For short haystacks or unsupported needles fall back to the naive algorithm 148 if (Length < 16 || N > 255 || N == 0) { 149 for (size_t e = Length - N + 1, i = std::min(From, e); i != e; ++i) 150 if (substr(i, N).equals(Str)) 151 return i; 152 return npos; 153 } 154 155 if (From >= Length) 156 return npos; 157 158 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 159 uint8_t BadCharSkip[256]; 160 std::memset(BadCharSkip, N, 256); 161 for (unsigned i = 0; i != N-1; ++i) 162 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 163 164 unsigned Len = Length-From, Pos = From; 165 while (Len >= N) { 166 if (substr(Pos, N).equals(Str)) // See if this is the correct substring. 167 return Pos; 168 169 // Otherwise skip the appropriate number of bytes. 170 uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]]; 171 Len -= Skip; 172 Pos += Skip; 173 } 174 175 return npos; 176 } 177 178 /// rfind - Search for the last string \arg Str in the string. 179 /// 180 /// \return - The index of the last occurrence of \arg Str, or npos if not 181 /// found. 182 size_t StringRef::rfind(StringRef Str) const { 183 size_t N = Str.size(); 184 if (N > Length) 185 return npos; 186 for (size_t i = Length - N + 1, e = 0; i != e;) { 187 --i; 188 if (substr(i, N).equals(Str)) 189 return i; 190 } 191 return npos; 192 } 193 194 /// find_first_of - Find the first character in the string that is in \arg 195 /// Chars, or npos if not found. 196 /// 197 /// Note: O(size() + Chars.size()) 198 StringRef::size_type StringRef::find_first_of(StringRef Chars, 199 size_t From) const { 200 std::bitset<1 << CHAR_BIT> CharBits; 201 for (size_type i = 0; i != Chars.size(); ++i) 202 CharBits.set((unsigned char)Chars[i]); 203 204 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 205 if (CharBits.test((unsigned char)Data[i])) 206 return i; 207 return npos; 208 } 209 210 /// find_first_not_of - Find the first character in the string that is not 211 /// \arg C or npos if not found. 212 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 213 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 214 if (Data[i] != C) 215 return i; 216 return npos; 217 } 218 219 /// find_first_not_of - Find the first character in the string that is not 220 /// in the string \arg Chars, or npos if not found. 221 /// 222 /// Note: O(size() + Chars.size()) 223 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 224 size_t From) const { 225 std::bitset<1 << CHAR_BIT> CharBits; 226 for (size_type i = 0; i != Chars.size(); ++i) 227 CharBits.set((unsigned char)Chars[i]); 228 229 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 230 if (!CharBits.test((unsigned char)Data[i])) 231 return i; 232 return npos; 233 } 234 235 /// find_last_of - Find the last character in the string that is in \arg C, 236 /// or npos if not found. 237 /// 238 /// Note: O(size() + Chars.size()) 239 StringRef::size_type StringRef::find_last_of(StringRef Chars, 240 size_t From) const { 241 std::bitset<1 << CHAR_BIT> CharBits; 242 for (size_type i = 0; i != Chars.size(); ++i) 243 CharBits.set((unsigned char)Chars[i]); 244 245 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 246 if (CharBits.test((unsigned char)Data[i])) 247 return i; 248 return npos; 249 } 250 251 /// find_last_not_of - Find the last character in the string that is not 252 /// \arg C, or npos if not found. 253 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 254 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 255 if (Data[i] != C) 256 return i; 257 return npos; 258 } 259 260 /// find_last_not_of - Find the last character in the string that is not in 261 /// \arg Chars, or npos if not found. 262 /// 263 /// Note: O(size() + Chars.size()) 264 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 265 size_t From) const { 266 std::bitset<1 << CHAR_BIT> CharBits; 267 for (size_type i = 0, e = Chars.size(); i != e; ++i) 268 CharBits.set((unsigned char)Chars[i]); 269 270 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 271 if (!CharBits.test((unsigned char)Data[i])) 272 return i; 273 return npos; 274 } 275 276 void StringRef::split(SmallVectorImpl<StringRef> &A, 277 StringRef Separator, int MaxSplit, 278 bool KeepEmpty) const { 279 StringRef S = *this; 280 281 // Count down from MaxSplit. When MaxSplit is -1, this will just split 282 // "forever". This doesn't support splitting more than 2^31 times 283 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 284 // but that seems unlikely to be useful. 285 while (MaxSplit-- != 0) { 286 size_t Idx = S.find(Separator); 287 if (Idx == npos) 288 break; 289 290 // Push this split. 291 if (KeepEmpty || Idx > 0) 292 A.push_back(S.slice(0, Idx)); 293 294 // Jump forward. 295 S = S.slice(Idx + Separator.size(), npos); 296 } 297 298 // Push the tail. 299 if (KeepEmpty || !S.empty()) 300 A.push_back(S); 301 } 302 303 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 304 int MaxSplit, bool KeepEmpty) const { 305 StringRef S = *this; 306 307 // Count down from MaxSplit. When MaxSplit is -1, this will just split 308 // "forever". This doesn't support splitting more than 2^31 times 309 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 310 // but that seems unlikely to be useful. 311 while (MaxSplit-- != 0) { 312 size_t Idx = S.find(Separator); 313 if (Idx == npos) 314 break; 315 316 // Push this split. 317 if (KeepEmpty || Idx > 0) 318 A.push_back(S.slice(0, Idx)); 319 320 // Jump forward. 321 S = S.slice(Idx + 1, npos); 322 } 323 324 // Push the tail. 325 if (KeepEmpty || !S.empty()) 326 A.push_back(S); 327 } 328 329 //===----------------------------------------------------------------------===// 330 // Helpful Algorithms 331 //===----------------------------------------------------------------------===// 332 333 /// count - Return the number of non-overlapped occurrences of \arg Str in 334 /// the string. 335 size_t StringRef::count(StringRef Str) const { 336 size_t Count = 0; 337 size_t N = Str.size(); 338 if (N > Length) 339 return 0; 340 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 341 if (substr(i, N).equals(Str)) 342 ++Count; 343 return Count; 344 } 345 346 static unsigned GetAutoSenseRadix(StringRef &Str) { 347 if (Str.startswith("0x")) { 348 Str = Str.substr(2); 349 return 16; 350 } 351 352 if (Str.startswith("0b")) { 353 Str = Str.substr(2); 354 return 2; 355 } 356 357 if (Str.startswith("0o")) { 358 Str = Str.substr(2); 359 return 8; 360 } 361 362 if (Str.startswith("0")) 363 return 8; 364 365 return 10; 366 } 367 368 369 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 370 /// sequence of radix up to 36 to an unsigned long long value. 371 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 372 unsigned long long &Result) { 373 // Autosense radix if not specified. 374 if (Radix == 0) 375 Radix = GetAutoSenseRadix(Str); 376 377 // Empty strings (after the radix autosense) are invalid. 378 if (Str.empty()) return true; 379 380 // Parse all the bytes of the string given this radix. Watch for overflow. 381 Result = 0; 382 while (!Str.empty()) { 383 unsigned CharVal; 384 if (Str[0] >= '0' && Str[0] <= '9') 385 CharVal = Str[0]-'0'; 386 else if (Str[0] >= 'a' && Str[0] <= 'z') 387 CharVal = Str[0]-'a'+10; 388 else if (Str[0] >= 'A' && Str[0] <= 'Z') 389 CharVal = Str[0]-'A'+10; 390 else 391 return true; 392 393 // If the parsed value is larger than the integer radix, the string is 394 // invalid. 395 if (CharVal >= Radix) 396 return true; 397 398 // Add in this character. 399 unsigned long long PrevResult = Result; 400 Result = Result*Radix+CharVal; 401 402 // Check for overflow by shifting back and seeing if bits were lost. 403 if (Result/Radix < PrevResult) 404 return true; 405 406 Str = Str.substr(1); 407 } 408 409 return false; 410 } 411 412 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 413 long long &Result) { 414 unsigned long long ULLVal; 415 416 // Handle positive strings first. 417 if (Str.empty() || Str.front() != '-') { 418 if (getAsUnsignedInteger(Str, Radix, ULLVal) || 419 // Check for value so large it overflows a signed value. 420 (long long)ULLVal < 0) 421 return true; 422 Result = ULLVal; 423 return false; 424 } 425 426 // Get the positive part of the value. 427 if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) || 428 // Reject values so large they'd overflow as negative signed, but allow 429 // "-0". This negates the unsigned so that the negative isn't undefined 430 // on signed overflow. 431 (long long)-ULLVal > 0) 432 return true; 433 434 Result = -ULLVal; 435 return false; 436 } 437 438 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 439 StringRef Str = *this; 440 441 // Autosense radix if not specified. 442 if (Radix == 0) 443 Radix = GetAutoSenseRadix(Str); 444 445 assert(Radix > 1 && Radix <= 36); 446 447 // Empty strings (after the radix autosense) are invalid. 448 if (Str.empty()) return true; 449 450 // Skip leading zeroes. This can be a significant improvement if 451 // it means we don't need > 64 bits. 452 while (!Str.empty() && Str.front() == '0') 453 Str = Str.substr(1); 454 455 // If it was nothing but zeroes.... 456 if (Str.empty()) { 457 Result = APInt(64, 0); 458 return false; 459 } 460 461 // (Over-)estimate the required number of bits. 462 unsigned Log2Radix = 0; 463 while ((1U << Log2Radix) < Radix) Log2Radix++; 464 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 465 466 unsigned BitWidth = Log2Radix * Str.size(); 467 if (BitWidth < Result.getBitWidth()) 468 BitWidth = Result.getBitWidth(); // don't shrink the result 469 else if (BitWidth > Result.getBitWidth()) 470 Result = Result.zext(BitWidth); 471 472 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 473 if (!IsPowerOf2Radix) { 474 // These must have the same bit-width as Result. 475 RadixAP = APInt(BitWidth, Radix); 476 CharAP = APInt(BitWidth, 0); 477 } 478 479 // Parse all the bytes of the string given this radix. 480 Result = 0; 481 while (!Str.empty()) { 482 unsigned CharVal; 483 if (Str[0] >= '0' && Str[0] <= '9') 484 CharVal = Str[0]-'0'; 485 else if (Str[0] >= 'a' && Str[0] <= 'z') 486 CharVal = Str[0]-'a'+10; 487 else if (Str[0] >= 'A' && Str[0] <= 'Z') 488 CharVal = Str[0]-'A'+10; 489 else 490 return true; 491 492 // If the parsed value is larger than the integer radix, the string is 493 // invalid. 494 if (CharVal >= Radix) 495 return true; 496 497 // Add in this character. 498 if (IsPowerOf2Radix) { 499 Result <<= Log2Radix; 500 Result |= CharVal; 501 } else { 502 Result *= RadixAP; 503 CharAP = CharVal; 504 Result += CharAP; 505 } 506 507 Str = Str.substr(1); 508 } 509 510 return false; 511 } 512 513 514 // Implementation of StringRef hashing. 515 hash_code llvm::hash_value(StringRef S) { 516 return hash_combine_range(S.begin(), S.end()); 517 } 518