1 #include "clang/AST/CommentLexer.h" 2 #include "clang/Basic/ConvertUTF.h" 3 #include "llvm/ADT/StringSwitch.h" 4 #include "llvm/Support/ErrorHandling.h" 5 6 namespace clang { 7 namespace comments { 8 9 void Token::dump(const Lexer &L, const SourceManager &SM) const { 10 llvm::errs() << "comments::Token Kind=" << Kind << " "; 11 Loc.dump(SM); 12 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 13 } 14 15 bool Lexer::isVerbatimBlockCommand(StringRef BeginName, 16 StringRef &EndName) const { 17 const char *Result = llvm::StringSwitch<const char *>(BeginName) 18 .Case("code", "endcode") 19 .Case("verbatim", "endverbatim") 20 .Case("htmlonly", "endhtmlonly") 21 .Case("latexonly", "endlatexonly") 22 .Case("xmlonly", "endxmlonly") 23 .Case("manonly", "endmanonly") 24 .Case("rtfonly", "endrtfonly") 25 26 .Case("dot", "enddot") 27 .Case("msc", "endmsc") 28 29 .Case("f$", "f$") // Inline LaTeX formula 30 .Case("f[", "f]") // Displayed LaTeX formula 31 .Case("f{", "f}") // LaTeX environment 32 33 .Default(NULL); 34 35 if (Result) { 36 EndName = Result; 37 return true; 38 } 39 40 for (VerbatimBlockCommandVector::const_iterator 41 I = VerbatimBlockCommands.begin(), 42 E = VerbatimBlockCommands.end(); 43 I != E; ++I) 44 if (I->BeginName == BeginName) { 45 EndName = I->EndName; 46 return true; 47 } 48 49 return false; 50 } 51 52 bool Lexer::isVerbatimLineCommand(StringRef Name) const { 53 bool Result = llvm::StringSwitch<bool>(Name) 54 .Case("fn", true) 55 .Case("var", true) 56 .Case("property", true) 57 .Case("typedef", true) 58 59 .Case("overload", true) 60 61 .Case("defgroup", true) 62 .Case("ingroup", true) 63 .Case("addtogroup", true) 64 .Case("weakgroup", true) 65 .Case("name", true) 66 67 .Case("section", true) 68 .Case("subsection", true) 69 .Case("subsubsection", true) 70 .Case("paragraph", true) 71 72 .Case("mainpage", true) 73 .Case("subpage", true) 74 .Case("ref", true) 75 76 .Default(false); 77 78 if (Result) 79 return true; 80 81 for (VerbatimLineCommandVector::const_iterator 82 I = VerbatimLineCommands.begin(), 83 E = VerbatimLineCommands.end(); 84 I != E; ++I) 85 if (I->Name == Name) 86 return true; 87 88 return false; 89 } 90 91 namespace { 92 bool isHTMLNamedCharacterReferenceCharacter(char C) { 93 return (C >= 'a' && C <= 'z') || 94 (C >= 'A' && C <= 'Z'); 95 } 96 97 bool isHTMLDecimalCharacterReferenceCharacter(char C) { 98 return C >= '0' && C <= '9'; 99 } 100 101 bool isHTMLHexCharacterReferenceCharacter(char C) { 102 return (C >= '0' && C <= '9') || 103 (C >= 'a' && C <= 'f') || 104 (C >= 'A' && C <= 'F'); 105 } 106 } // unnamed namespace 107 108 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 109 return llvm::StringSwitch<StringRef>(Name) 110 .Case("amp", "&") 111 .Case("lt", "<") 112 .Case("gt", ">") 113 .Case("quot", "\"") 114 .Case("apos", "\'") 115 .Default(""); 116 } 117 118 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 119 unsigned CodePoint = 0; 120 for (unsigned i = 0, e = Name.size(); i != e; ++i) { 121 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 122 CodePoint *= 10; 123 CodePoint += Name[i] - '0'; 124 } 125 126 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 127 char *ResolvedPtr = Resolved; 128 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 129 return StringRef(Resolved, ResolvedPtr - Resolved); 130 else 131 return StringRef(); 132 } 133 134 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 135 unsigned CodePoint = 0; 136 for (unsigned i = 0, e = Name.size(); i != e; ++i) { 137 CodePoint *= 16; 138 const char C = Name[i]; 139 assert(isHTMLHexCharacterReferenceCharacter(C)); 140 if (C >= '0' && C <= '9') 141 CodePoint += Name[i] - '0'; 142 else if (C >= 'a' && C <= 'f') 143 CodePoint += Name[i] - 'a' + 10; 144 else 145 CodePoint += Name[i] - 'A' + 10; 146 } 147 148 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 149 char *ResolvedPtr = Resolved; 150 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 151 return StringRef(Resolved, ResolvedPtr - Resolved); 152 else 153 return StringRef(); 154 } 155 156 void Lexer::skipLineStartingDecorations() { 157 // This function should be called only for C comments 158 assert(CommentState == LCS_InsideCComment); 159 160 if (BufferPtr == CommentEnd) 161 return; 162 163 switch (*BufferPtr) { 164 case ' ': 165 case '\t': 166 case '\f': 167 case '\v': { 168 const char *NewBufferPtr = BufferPtr; 169 NewBufferPtr++; 170 if (NewBufferPtr == CommentEnd) 171 return; 172 173 char C = *NewBufferPtr; 174 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') { 175 NewBufferPtr++; 176 if (NewBufferPtr == CommentEnd) 177 return; 178 C = *NewBufferPtr; 179 } 180 if (C == '*') 181 BufferPtr = NewBufferPtr + 1; 182 break; 183 } 184 case '*': 185 BufferPtr++; 186 break; 187 } 188 } 189 190 namespace { 191 /// Returns pointer to the first newline character in the string. 192 const char *findNewline(const char *BufferPtr, const char *BufferEnd) { 193 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 194 const char C = *BufferPtr; 195 if (C == '\n' || C == '\r') 196 return BufferPtr; 197 } 198 return BufferEnd; 199 } 200 201 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 202 if (BufferPtr == BufferEnd) 203 return BufferPtr; 204 205 if (*BufferPtr == '\n') 206 BufferPtr++; 207 else { 208 assert(*BufferPtr == '\r'); 209 BufferPtr++; 210 if (BufferPtr != BufferEnd && *BufferPtr == '\n') 211 BufferPtr++; 212 } 213 return BufferPtr; 214 } 215 216 const char *skipNamedCharacterReference(const char *BufferPtr, 217 const char *BufferEnd) { 218 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 219 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 220 return BufferPtr; 221 } 222 return BufferEnd; 223 } 224 225 const char *skipDecimalCharacterReference(const char *BufferPtr, 226 const char *BufferEnd) { 227 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 228 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 229 return BufferPtr; 230 } 231 return BufferEnd; 232 } 233 234 const char *skipHexCharacterReference(const char *BufferPtr, 235 const char *BufferEnd) { 236 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 237 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 238 return BufferPtr; 239 } 240 return BufferEnd; 241 } 242 243 bool isHTMLIdentifierStartingCharacter(char C) { 244 return (C >= 'a' && C <= 'z') || 245 (C >= 'A' && C <= 'Z'); 246 } 247 248 bool isHTMLIdentifierCharacter(char C) { 249 return (C >= 'a' && C <= 'z') || 250 (C >= 'A' && C <= 'Z') || 251 (C >= '0' && C <= '9'); 252 } 253 254 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 255 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 256 if (!isHTMLIdentifierCharacter(*BufferPtr)) 257 return BufferPtr; 258 } 259 return BufferEnd; 260 } 261 262 /// Skip HTML string quoted in single or double quotes. Escaping quotes inside 263 /// string allowed. 264 /// 265 /// Returns pointer to closing quote. 266 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 267 { 268 const char Quote = *BufferPtr; 269 assert(Quote == '\"' || Quote == '\''); 270 271 BufferPtr++; 272 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 273 const char C = *BufferPtr; 274 if (C == Quote && BufferPtr[-1] != '\\') 275 return BufferPtr; 276 } 277 return BufferEnd; 278 } 279 280 bool isHorizontalWhitespace(char C) { 281 return C == ' ' || C == '\t' || C == '\f' || C == '\v'; 282 } 283 284 bool isWhitespace(char C) { 285 return C == ' ' || C == '\n' || C == '\r' || 286 C == '\t' || C == '\f' || C == '\v'; 287 } 288 289 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 290 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 291 if (!isWhitespace(*BufferPtr)) 292 return BufferPtr; 293 } 294 return BufferEnd; 295 } 296 297 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 298 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 299 } 300 301 bool isCommandNameCharacter(char C) { 302 return (C >= 'a' && C <= 'z') || 303 (C >= 'A' && C <= 'Z') || 304 (C >= '0' && C <= '9'); 305 } 306 307 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 308 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 309 if (!isCommandNameCharacter(*BufferPtr)) 310 return BufferPtr; 311 } 312 return BufferEnd; 313 } 314 315 /// Return the one past end pointer for BCPL comments. 316 /// Handles newlines escaped with backslash or trigraph for backslahs. 317 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 318 const char *CurPtr = BufferPtr; 319 while (CurPtr != BufferEnd) { 320 char C = *CurPtr; 321 while (C != '\n' && C != '\r') { 322 CurPtr++; 323 if (CurPtr == BufferEnd) 324 return BufferEnd; 325 C = *CurPtr; 326 } 327 // We found a newline, check if it is escaped. 328 const char *EscapePtr = CurPtr - 1; 329 while(isHorizontalWhitespace(*EscapePtr)) 330 EscapePtr--; 331 332 if (*EscapePtr == '\\' || 333 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 334 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 335 // We found an escaped newline. 336 CurPtr = skipNewline(CurPtr, BufferEnd); 337 } else 338 return CurPtr; // Not an escaped newline. 339 } 340 return BufferEnd; 341 } 342 343 /// Return the one past end pointer for C comments. 344 /// Very dumb, does not handle escaped newlines or trigraphs. 345 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 346 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 347 if (*BufferPtr == '*') { 348 assert(BufferPtr + 1 != BufferEnd); 349 if (*(BufferPtr + 1) == '/') 350 return BufferPtr; 351 } 352 } 353 llvm_unreachable("buffer end hit before '*/' was seen"); 354 } 355 } // unnamed namespace 356 357 void Lexer::lexCommentText(Token &T) { 358 assert(CommentState == LCS_InsideBCPLComment || 359 CommentState == LCS_InsideCComment); 360 361 switch (State) { 362 case LS_Normal: 363 break; 364 case LS_VerbatimBlockFirstLine: 365 lexVerbatimBlockFirstLine(T); 366 return; 367 case LS_VerbatimBlockBody: 368 lexVerbatimBlockBody(T); 369 return; 370 case LS_VerbatimLineText: 371 lexVerbatimLineText(T); 372 return; 373 case LS_HTMLStartTag: 374 lexHTMLStartTag(T); 375 return; 376 case LS_HTMLEndTag: 377 lexHTMLEndTag(T); 378 return; 379 } 380 381 assert(State == LS_Normal); 382 383 const char *TokenPtr = BufferPtr; 384 assert(TokenPtr < CommentEnd); 385 while (TokenPtr != CommentEnd) { 386 switch(*TokenPtr) { 387 case '\\': 388 case '@': { 389 TokenPtr++; 390 if (TokenPtr == CommentEnd) { 391 formTextToken(T, TokenPtr); 392 return; 393 } 394 char C = *TokenPtr; 395 switch (C) { 396 default: 397 break; 398 399 case '\\': case '@': case '&': case '$': 400 case '#': case '<': case '>': case '%': 401 case '\"': case '.': case ':': 402 // This is one of \\ \@ \& \$ etc escape sequences. 403 TokenPtr++; 404 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 405 // This is the \:: escape sequence. 406 TokenPtr++; 407 } 408 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 409 formTokenWithChars(T, TokenPtr, tok::text); 410 T.setText(UnescapedText); 411 return; 412 } 413 414 // Don't make zero-length commands. 415 if (!isCommandNameCharacter(*TokenPtr)) { 416 formTextToken(T, TokenPtr); 417 return; 418 } 419 420 TokenPtr = skipCommandName(TokenPtr, CommentEnd); 421 unsigned Length = TokenPtr - (BufferPtr + 1); 422 423 // Hardcoded support for lexing LaTeX formula commands 424 // \f$ \f[ \f] \f{ \f} as a single command. 425 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 426 C = *TokenPtr; 427 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 428 TokenPtr++; 429 Length++; 430 } 431 } 432 433 const StringRef CommandName(BufferPtr + 1, Length); 434 StringRef EndName; 435 436 if (isVerbatimBlockCommand(CommandName, EndName)) { 437 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName); 438 return; 439 } 440 if (isVerbatimLineCommand(CommandName)) { 441 setupAndLexVerbatimLine(T, TokenPtr); 442 return; 443 } 444 formTokenWithChars(T, TokenPtr, tok::command); 445 T.setCommandName(CommandName); 446 return; 447 } 448 449 case '&': 450 lexHTMLCharacterReference(T); 451 return; 452 453 case '<': { 454 TokenPtr++; 455 if (TokenPtr == CommentEnd) { 456 formTextToken(T, TokenPtr); 457 return; 458 } 459 const char C = *TokenPtr; 460 if (isHTMLIdentifierStartingCharacter(C)) 461 setupAndLexHTMLStartTag(T); 462 else if (C == '/') 463 setupAndLexHTMLEndTag(T); 464 else 465 formTextToken(T, TokenPtr); 466 467 return; 468 } 469 470 case '\n': 471 case '\r': 472 TokenPtr = skipNewline(TokenPtr, CommentEnd); 473 formTokenWithChars(T, TokenPtr, tok::newline); 474 475 if (CommentState == LCS_InsideCComment) 476 skipLineStartingDecorations(); 477 return; 478 479 default: { 480 while (true) { 481 TokenPtr++; 482 if (TokenPtr == CommentEnd) 483 break; 484 const char C = *TokenPtr; 485 if(C == '\n' || C == '\r' || 486 C == '\\' || C == '@' || C == '&' || C == '<') 487 break; 488 } 489 formTextToken(T, TokenPtr); 490 return; 491 } 492 } 493 } 494 } 495 496 void Lexer::setupAndLexVerbatimBlock(Token &T, 497 const char *TextBegin, 498 char Marker, StringRef EndName) { 499 VerbatimBlockEndCommandName.clear(); 500 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 501 VerbatimBlockEndCommandName.append(EndName); 502 503 StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1)); 504 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 505 T.setVerbatimBlockName(Name); 506 507 // If there is a newline following the verbatim opening command, skip the 508 // newline so that we don't create an tok::verbatim_block_line with empty 509 // text content. 510 if (BufferPtr != CommentEnd) { 511 const char C = *BufferPtr; 512 if (C == '\n' || C == '\r') { 513 BufferPtr = skipNewline(BufferPtr, CommentEnd); 514 State = LS_VerbatimBlockBody; 515 return; 516 } 517 } 518 519 State = LS_VerbatimBlockFirstLine; 520 } 521 522 void Lexer::lexVerbatimBlockFirstLine(Token &T) { 523 again: 524 assert(BufferPtr < CommentEnd); 525 526 // FIXME: It would be better to scan the text once, finding either the block 527 // end command or newline. 528 // 529 // Extract current line. 530 const char *Newline = findNewline(BufferPtr, CommentEnd); 531 StringRef Line(BufferPtr, Newline - BufferPtr); 532 533 // Look for end command in current line. 534 size_t Pos = Line.find(VerbatimBlockEndCommandName); 535 const char *TextEnd; 536 const char *NextLine; 537 if (Pos == StringRef::npos) { 538 // Current line is completely verbatim. 539 TextEnd = Newline; 540 NextLine = skipNewline(Newline, CommentEnd); 541 } else if (Pos == 0) { 542 // Current line contains just an end command. 543 const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 544 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 545 formTokenWithChars(T, End, tok::verbatim_block_end); 546 T.setVerbatimBlockName(Name); 547 State = LS_Normal; 548 return; 549 } else { 550 // There is some text, followed by end command. Extract text first. 551 TextEnd = BufferPtr + Pos; 552 NextLine = TextEnd; 553 // If there is only whitespace before end command, skip whitespace. 554 if (isWhitespace(BufferPtr, TextEnd)) { 555 BufferPtr = TextEnd; 556 goto again; 557 } 558 } 559 560 StringRef Text(BufferPtr, TextEnd - BufferPtr); 561 formTokenWithChars(T, NextLine, tok::verbatim_block_line); 562 T.setVerbatimBlockText(Text); 563 564 State = LS_VerbatimBlockBody; 565 } 566 567 void Lexer::lexVerbatimBlockBody(Token &T) { 568 assert(State == LS_VerbatimBlockBody); 569 570 if (CommentState == LCS_InsideCComment) 571 skipLineStartingDecorations(); 572 573 lexVerbatimBlockFirstLine(T); 574 } 575 576 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) { 577 const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1); 578 formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 579 T.setVerbatimLineName(Name); 580 581 State = LS_VerbatimLineText; 582 } 583 584 void Lexer::lexVerbatimLineText(Token &T) { 585 assert(State == LS_VerbatimLineText); 586 587 // Extract current line. 588 const char *Newline = findNewline(BufferPtr, CommentEnd); 589 const StringRef Text(BufferPtr, Newline - BufferPtr); 590 formTokenWithChars(T, Newline, tok::verbatim_line_text); 591 T.setVerbatimLineText(Text); 592 593 State = LS_Normal; 594 } 595 596 void Lexer::lexHTMLCharacterReference(Token &T) { 597 const char *TokenPtr = BufferPtr; 598 assert(*TokenPtr == '&'); 599 TokenPtr++; 600 if (TokenPtr == CommentEnd) { 601 formTextToken(T, TokenPtr); 602 return; 603 } 604 const char *NamePtr; 605 bool isNamed = false; 606 bool isDecimal = false; 607 char C = *TokenPtr; 608 if (isHTMLNamedCharacterReferenceCharacter(C)) { 609 NamePtr = TokenPtr; 610 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 611 isNamed = true; 612 } else if (C == '#') { 613 TokenPtr++; 614 if (TokenPtr == CommentEnd) { 615 formTextToken(T, TokenPtr); 616 return; 617 } 618 C = *TokenPtr; 619 if (isHTMLDecimalCharacterReferenceCharacter(C)) { 620 NamePtr = TokenPtr; 621 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 622 isDecimal = true; 623 } else if (C == 'x' || C == 'X') { 624 TokenPtr++; 625 NamePtr = TokenPtr; 626 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 627 } else { 628 formTextToken(T, TokenPtr); 629 return; 630 } 631 } else { 632 formTextToken(T, TokenPtr); 633 return; 634 } 635 if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 636 *TokenPtr != ';') { 637 formTextToken(T, TokenPtr); 638 return; 639 } 640 StringRef Name(NamePtr, TokenPtr - NamePtr); 641 TokenPtr++; // Skip semicolon. 642 StringRef Resolved; 643 if (isNamed) 644 Resolved = resolveHTMLNamedCharacterReference(Name); 645 else if (isDecimal) 646 Resolved = resolveHTMLDecimalCharacterReference(Name); 647 else 648 Resolved = resolveHTMLHexCharacterReference(Name); 649 650 if (Resolved.empty()) { 651 formTextToken(T, TokenPtr); 652 return; 653 } 654 formTokenWithChars(T, TokenPtr, tok::text); 655 T.setText(Resolved); 656 return; 657 } 658 659 void Lexer::setupAndLexHTMLStartTag(Token &T) { 660 assert(BufferPtr[0] == '<' && 661 isHTMLIdentifierStartingCharacter(BufferPtr[1])); 662 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 663 664 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 665 formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 666 T.setHTMLTagStartName(Name); 667 668 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 669 670 const char C = *BufferPtr; 671 if (BufferPtr != CommentEnd && 672 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 673 State = LS_HTMLStartTag; 674 } 675 676 void Lexer::lexHTMLStartTag(Token &T) { 677 assert(State == LS_HTMLStartTag); 678 679 const char *TokenPtr = BufferPtr; 680 char C = *TokenPtr; 681 if (isHTMLIdentifierCharacter(C)) { 682 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 683 StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 684 formTokenWithChars(T, TokenPtr, tok::html_ident); 685 T.setHTMLIdent(Ident); 686 } else { 687 switch (C) { 688 case '=': 689 TokenPtr++; 690 formTokenWithChars(T, TokenPtr, tok::html_equals); 691 break; 692 case '\"': 693 case '\'': { 694 const char *OpenQuote = TokenPtr; 695 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 696 const char *ClosingQuote = TokenPtr; 697 if (TokenPtr != CommentEnd) // Skip closing quote. 698 TokenPtr++; 699 formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 700 T.setHTMLQuotedString(StringRef(OpenQuote + 1, 701 ClosingQuote - (OpenQuote + 1))); 702 break; 703 } 704 case '>': 705 TokenPtr++; 706 formTokenWithChars(T, TokenPtr, tok::html_greater); 707 State = LS_Normal; 708 return; 709 case '/': 710 TokenPtr++; 711 if (TokenPtr != CommentEnd && *TokenPtr == '>') { 712 TokenPtr++; 713 formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 714 } else 715 formTextToken(T, TokenPtr); 716 717 State = LS_Normal; 718 return; 719 } 720 } 721 722 // Now look ahead and return to normal state if we don't see any HTML tokens 723 // ahead. 724 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 725 if (BufferPtr == CommentEnd) { 726 State = LS_Normal; 727 return; 728 } 729 730 C = *BufferPtr; 731 if (!isHTMLIdentifierStartingCharacter(C) && 732 C != '=' && C != '\"' && C != '\'' && C != '>') { 733 State = LS_Normal; 734 return; 735 } 736 } 737 738 void Lexer::setupAndLexHTMLEndTag(Token &T) { 739 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 740 741 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 742 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 743 744 const char *End = skipWhitespace(TagNameEnd, CommentEnd); 745 746 formTokenWithChars(T, End, tok::html_end_tag); 747 T.setHTMLTagEndName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin)); 748 749 if (BufferPtr != CommentEnd && *BufferPtr == '>') 750 State = LS_HTMLEndTag; 751 } 752 753 void Lexer::lexHTMLEndTag(Token &T) { 754 assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 755 756 formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 757 State = LS_Normal; 758 } 759 760 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, 761 SourceLocation FileLoc, const CommentOptions &CommOpts, 762 const char *BufferStart, const char *BufferEnd): 763 Allocator(Allocator), 764 BufferStart(BufferStart), BufferEnd(BufferEnd), 765 FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart), 766 CommentState(LCS_BeforeComment), State(LS_Normal) { 767 } 768 769 void Lexer::lex(Token &T) { 770 again: 771 switch (CommentState) { 772 case LCS_BeforeComment: 773 if (BufferPtr == BufferEnd) { 774 formTokenWithChars(T, BufferPtr, tok::eof); 775 return; 776 } 777 778 assert(*BufferPtr == '/'); 779 BufferPtr++; // Skip first slash. 780 switch(*BufferPtr) { 781 case '/': { // BCPL comment. 782 BufferPtr++; // Skip second slash. 783 784 if (BufferPtr != BufferEnd) { 785 // Skip Doxygen magic marker, if it is present. 786 // It might be missing because of a typo //< or /*<, or because we 787 // merged this non-Doxygen comment into a bunch of Doxygen comments 788 // around it: /** ... */ /* ... */ /** ... */ 789 const char C = *BufferPtr; 790 if (C == '/' || C == '!') 791 BufferPtr++; 792 } 793 794 // Skip less-than symbol that marks trailing comments. 795 // Skip it even if the comment is not a Doxygen one, because //< and /*< 796 // are frequent typos. 797 if (BufferPtr != BufferEnd && *BufferPtr == '<') 798 BufferPtr++; 799 800 CommentState = LCS_InsideBCPLComment; 801 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 802 State = LS_Normal; 803 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 804 goto again; 805 } 806 case '*': { // C comment. 807 BufferPtr++; // Skip star. 808 809 // Skip Doxygen magic marker. 810 const char C = *BufferPtr; 811 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 812 BufferPtr++; 813 814 // Skip less-than symbol that marks trailing comments. 815 if (BufferPtr != BufferEnd && *BufferPtr == '<') 816 BufferPtr++; 817 818 CommentState = LCS_InsideCComment; 819 State = LS_Normal; 820 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 821 goto again; 822 } 823 default: 824 llvm_unreachable("second character of comment should be '/' or '*'"); 825 } 826 827 case LCS_BetweenComments: { 828 // Consecutive comments are extracted only if there is only whitespace 829 // between them. So we can search for the start of the next comment. 830 const char *EndWhitespace = BufferPtr; 831 while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 832 EndWhitespace++; 833 834 // Turn any whitespace between comments (and there is only whitespace 835 // between them -- guaranteed by comment extraction) into a newline. We 836 // have two newlines between C comments in total (first one was synthesized 837 // after a comment). 838 formTokenWithChars(T, EndWhitespace, tok::newline); 839 840 CommentState = LCS_BeforeComment; 841 break; 842 } 843 844 case LCS_InsideBCPLComment: 845 case LCS_InsideCComment: 846 if (BufferPtr != CommentEnd) { 847 lexCommentText(T); 848 break; 849 } else { 850 // Skip C comment closing sequence. 851 if (CommentState == LCS_InsideCComment) { 852 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 853 BufferPtr += 2; 854 assert(BufferPtr <= BufferEnd); 855 856 // Synthenize newline just after the C comment, regardless if there is 857 // actually a newline. 858 formTokenWithChars(T, BufferPtr, tok::newline); 859 860 CommentState = LCS_BetweenComments; 861 break; 862 } else { 863 // Don't synthesized a newline after BCPL comment. 864 CommentState = LCS_BetweenComments; 865 goto again; 866 } 867 } 868 } 869 } 870 871 StringRef Lexer::getSpelling(const Token &Tok, 872 const SourceManager &SourceMgr, 873 bool *Invalid) const { 874 SourceLocation Loc = Tok.getLocation(); 875 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 876 877 bool InvalidTemp = false; 878 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 879 if (InvalidTemp) { 880 *Invalid = true; 881 return StringRef(); 882 } 883 884 const char *Begin = File.data() + LocInfo.second; 885 return StringRef(Begin, Tok.getLength()); 886 } 887 888 void Lexer::addVerbatimBlockCommand(StringRef BeginName, StringRef EndName) { 889 VerbatimBlockCommand VBC; 890 VBC.BeginName = BeginName; 891 VBC.EndName = EndName; 892 VerbatimBlockCommands.push_back(VBC); 893 } 894 895 void Lexer::addVerbatimLineCommand(StringRef Name) { 896 VerbatimLineCommand VLC; 897 VLC.Name = Name; 898 VerbatimLineCommands.push_back(VLC); 899 } 900 901 } // end namespace comments 902 } // end namespace clang 903 904