1 //===--- CommentLexer.cpp -------------------------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "clang/AST/CommentLexer.h" 11 #include "clang/AST/CommentCommandTraits.h" 12 #include "clang/AST/CommentDiagnostic.h" 13 #include "clang/Basic/CharInfo.h" 14 #include "llvm/ADT/StringExtras.h" 15 #include "llvm/ADT/StringSwitch.h" 16 #include "llvm/Support/ConvertUTF.h" 17 #include "llvm/Support/ErrorHandling.h" 18 19 namespace clang { 20 namespace comments { 21 22 void Token::dump(const Lexer &L, const SourceManager &SM) const { 23 llvm::errs() << "comments::Token Kind=" << Kind << " "; 24 Loc.dump(SM); 25 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 26 } 27 28 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) { 29 return isLetter(C); 30 } 31 32 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { 33 return isDigit(C); 34 } 35 36 static inline bool isHTMLHexCharacterReferenceCharacter(char C) { 37 return isHexDigit(C); 38 } 39 40 static inline StringRef convertCodePointToUTF8( 41 llvm::BumpPtrAllocator &Allocator, 42 unsigned CodePoint) { 43 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 44 char *ResolvedPtr = Resolved; 45 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 46 return StringRef(Resolved, ResolvedPtr - Resolved); 47 else 48 return StringRef(); 49 } 50 51 namespace { 52 53 #include "clang/AST/CommentHTMLTags.inc" 54 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 55 56 } // end anonymous namespace 57 58 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 59 // Fast path, first check a few most widely used named character references. 60 return llvm::StringSwitch<StringRef>(Name) 61 .Case("amp", "&") 62 .Case("lt", "<") 63 .Case("gt", ">") 64 .Case("quot", "\"") 65 .Case("apos", "\'") 66 // Slow path. 67 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); 68 } 69 70 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 71 unsigned CodePoint = 0; 72 for (unsigned i = 0, e = Name.size(); i != e; ++i) { 73 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 74 CodePoint *= 10; 75 CodePoint += Name[i] - '0'; 76 } 77 return convertCodePointToUTF8(Allocator, CodePoint); 78 } 79 80 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 81 unsigned CodePoint = 0; 82 for (unsigned i = 0, e = Name.size(); i != e; ++i) { 83 CodePoint *= 16; 84 const char C = Name[i]; 85 assert(isHTMLHexCharacterReferenceCharacter(C)); 86 CodePoint += llvm::hexDigitValue(C); 87 } 88 return convertCodePointToUTF8(Allocator, CodePoint); 89 } 90 91 void Lexer::skipLineStartingDecorations() { 92 // This function should be called only for C comments 93 assert(CommentState == LCS_InsideCComment); 94 95 if (BufferPtr == CommentEnd) 96 return; 97 98 switch (*BufferPtr) { 99 case ' ': 100 case '\t': 101 case '\f': 102 case '\v': { 103 const char *NewBufferPtr = BufferPtr; 104 NewBufferPtr++; 105 if (NewBufferPtr == CommentEnd) 106 return; 107 108 char C = *NewBufferPtr; 109 while (isHorizontalWhitespace(C)) { 110 NewBufferPtr++; 111 if (NewBufferPtr == CommentEnd) 112 return; 113 C = *NewBufferPtr; 114 } 115 if (C == '*') 116 BufferPtr = NewBufferPtr + 1; 117 break; 118 } 119 case '*': 120 BufferPtr++; 121 break; 122 } 123 } 124 125 namespace { 126 /// Returns pointer to the first newline character in the string. 127 const char *findNewline(const char *BufferPtr, const char *BufferEnd) { 128 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 129 if (isVerticalWhitespace(*BufferPtr)) 130 return BufferPtr; 131 } 132 return BufferEnd; 133 } 134 135 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 136 if (BufferPtr == BufferEnd) 137 return BufferPtr; 138 139 if (*BufferPtr == '\n') 140 BufferPtr++; 141 else { 142 assert(*BufferPtr == '\r'); 143 BufferPtr++; 144 if (BufferPtr != BufferEnd && *BufferPtr == '\n') 145 BufferPtr++; 146 } 147 return BufferPtr; 148 } 149 150 const char *skipNamedCharacterReference(const char *BufferPtr, 151 const char *BufferEnd) { 152 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 153 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 154 return BufferPtr; 155 } 156 return BufferEnd; 157 } 158 159 const char *skipDecimalCharacterReference(const char *BufferPtr, 160 const char *BufferEnd) { 161 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 162 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 163 return BufferPtr; 164 } 165 return BufferEnd; 166 } 167 168 const char *skipHexCharacterReference(const char *BufferPtr, 169 const char *BufferEnd) { 170 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 171 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 172 return BufferPtr; 173 } 174 return BufferEnd; 175 } 176 177 bool isHTMLIdentifierStartingCharacter(char C) { 178 return isLetter(C); 179 } 180 181 bool isHTMLIdentifierCharacter(char C) { 182 return isAlphanumeric(C); 183 } 184 185 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 186 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 187 if (!isHTMLIdentifierCharacter(*BufferPtr)) 188 return BufferPtr; 189 } 190 return BufferEnd; 191 } 192 193 /// Skip HTML string quoted in single or double quotes. Escaping quotes inside 194 /// string allowed. 195 /// 196 /// Returns pointer to closing quote. 197 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 198 { 199 const char Quote = *BufferPtr; 200 assert(Quote == '\"' || Quote == '\''); 201 202 BufferPtr++; 203 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 204 const char C = *BufferPtr; 205 if (C == Quote && BufferPtr[-1] != '\\') 206 return BufferPtr; 207 } 208 return BufferEnd; 209 } 210 211 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 212 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 213 if (!isWhitespace(*BufferPtr)) 214 return BufferPtr; 215 } 216 return BufferEnd; 217 } 218 219 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 220 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 221 } 222 223 bool isCommandNameStartCharacter(char C) { 224 return isLetter(C); 225 } 226 227 bool isCommandNameCharacter(char C) { 228 return isAlphanumeric(C); 229 } 230 231 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 232 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 233 if (!isCommandNameCharacter(*BufferPtr)) 234 return BufferPtr; 235 } 236 return BufferEnd; 237 } 238 239 /// Return the one past end pointer for BCPL comments. 240 /// Handles newlines escaped with backslash or trigraph for backslahs. 241 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 242 const char *CurPtr = BufferPtr; 243 while (CurPtr != BufferEnd) { 244 while (!isVerticalWhitespace(*CurPtr)) { 245 CurPtr++; 246 if (CurPtr == BufferEnd) 247 return BufferEnd; 248 } 249 // We found a newline, check if it is escaped. 250 const char *EscapePtr = CurPtr - 1; 251 while(isHorizontalWhitespace(*EscapePtr)) 252 EscapePtr--; 253 254 if (*EscapePtr == '\\' || 255 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 256 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 257 // We found an escaped newline. 258 CurPtr = skipNewline(CurPtr, BufferEnd); 259 } else 260 return CurPtr; // Not an escaped newline. 261 } 262 return BufferEnd; 263 } 264 265 /// Return the one past end pointer for C comments. 266 /// Very dumb, does not handle escaped newlines or trigraphs. 267 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 268 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 269 if (*BufferPtr == '*') { 270 assert(BufferPtr + 1 != BufferEnd); 271 if (*(BufferPtr + 1) == '/') 272 return BufferPtr; 273 } 274 } 275 llvm_unreachable("buffer end hit before '*/' was seen"); 276 } 277 278 } // end anonymous namespace 279 280 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd, 281 tok::TokenKind Kind) { 282 const unsigned TokLen = TokEnd - BufferPtr; 283 Result.setLocation(getSourceLocation(BufferPtr)); 284 Result.setKind(Kind); 285 Result.setLength(TokLen); 286 #ifndef NDEBUG 287 Result.TextPtr = "<UNSET>"; 288 Result.IntVal = 7; 289 #endif 290 BufferPtr = TokEnd; 291 } 292 293 void Lexer::lexCommentText(Token &T) { 294 assert(CommentState == LCS_InsideBCPLComment || 295 CommentState == LCS_InsideCComment); 296 297 switch (State) { 298 case LS_Normal: 299 break; 300 case LS_VerbatimBlockFirstLine: 301 lexVerbatimBlockFirstLine(T); 302 return; 303 case LS_VerbatimBlockBody: 304 lexVerbatimBlockBody(T); 305 return; 306 case LS_VerbatimLineText: 307 lexVerbatimLineText(T); 308 return; 309 case LS_HTMLStartTag: 310 lexHTMLStartTag(T); 311 return; 312 case LS_HTMLEndTag: 313 lexHTMLEndTag(T); 314 return; 315 } 316 317 assert(State == LS_Normal); 318 319 const char *TokenPtr = BufferPtr; 320 assert(TokenPtr < CommentEnd); 321 while (TokenPtr != CommentEnd) { 322 switch(*TokenPtr) { 323 case '\\': 324 case '@': { 325 // Commands that start with a backslash and commands that start with 326 // 'at' have equivalent semantics. But we keep information about the 327 // exact syntax in AST for comments. 328 tok::TokenKind CommandKind = 329 (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; 330 TokenPtr++; 331 if (TokenPtr == CommentEnd) { 332 formTextToken(T, TokenPtr); 333 return; 334 } 335 char C = *TokenPtr; 336 switch (C) { 337 default: 338 break; 339 340 case '\\': case '@': case '&': case '$': 341 case '#': case '<': case '>': case '%': 342 case '\"': case '.': case ':': 343 // This is one of \\ \@ \& \$ etc escape sequences. 344 TokenPtr++; 345 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 346 // This is the \:: escape sequence. 347 TokenPtr++; 348 } 349 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 350 formTokenWithChars(T, TokenPtr, tok::text); 351 T.setText(UnescapedText); 352 return; 353 } 354 355 // Don't make zero-length commands. 356 if (!isCommandNameStartCharacter(*TokenPtr)) { 357 formTextToken(T, TokenPtr); 358 return; 359 } 360 361 TokenPtr = skipCommandName(TokenPtr, CommentEnd); 362 unsigned Length = TokenPtr - (BufferPtr + 1); 363 364 // Hardcoded support for lexing LaTeX formula commands 365 // \f$ \f[ \f] \f{ \f} as a single command. 366 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 367 C = *TokenPtr; 368 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 369 TokenPtr++; 370 Length++; 371 } 372 } 373 374 StringRef CommandName(BufferPtr + 1, Length); 375 376 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); 377 if (!Info) { 378 if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { 379 StringRef CorrectedName = Info->Name; 380 SourceLocation Loc = getSourceLocation(BufferPtr); 381 SourceLocation EndLoc = getSourceLocation(TokenPtr); 382 SourceRange FullRange = SourceRange(Loc, EndLoc); 383 SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc); 384 Diag(Loc, diag::warn_correct_comment_command_name) 385 << FullRange << CommandName << CorrectedName 386 << FixItHint::CreateReplacement(CommandRange, CorrectedName); 387 } else { 388 formTokenWithChars(T, TokenPtr, tok::unknown_command); 389 T.setUnknownCommandName(CommandName); 390 Diag(T.getLocation(), diag::warn_unknown_comment_command_name) 391 << SourceRange(T.getLocation(), T.getEndLocation()); 392 return; 393 } 394 } 395 if (Info->IsVerbatimBlockCommand) { 396 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); 397 return; 398 } 399 if (Info->IsVerbatimLineCommand) { 400 setupAndLexVerbatimLine(T, TokenPtr, Info); 401 return; 402 } 403 formTokenWithChars(T, TokenPtr, CommandKind); 404 T.setCommandID(Info->getID()); 405 return; 406 } 407 408 case '&': 409 lexHTMLCharacterReference(T); 410 return; 411 412 case '<': { 413 TokenPtr++; 414 if (TokenPtr == CommentEnd) { 415 formTextToken(T, TokenPtr); 416 return; 417 } 418 const char C = *TokenPtr; 419 if (isHTMLIdentifierStartingCharacter(C)) 420 setupAndLexHTMLStartTag(T); 421 else if (C == '/') 422 setupAndLexHTMLEndTag(T); 423 else 424 formTextToken(T, TokenPtr); 425 return; 426 } 427 428 case '\n': 429 case '\r': 430 TokenPtr = skipNewline(TokenPtr, CommentEnd); 431 formTokenWithChars(T, TokenPtr, tok::newline); 432 433 if (CommentState == LCS_InsideCComment) 434 skipLineStartingDecorations(); 435 return; 436 437 default: { 438 size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr). 439 find_first_of("\n\r\\@&<"); 440 if (End != StringRef::npos) 441 TokenPtr += End; 442 else 443 TokenPtr = CommentEnd; 444 formTextToken(T, TokenPtr); 445 return; 446 } 447 } 448 } 449 } 450 451 void Lexer::setupAndLexVerbatimBlock(Token &T, 452 const char *TextBegin, 453 char Marker, const CommandInfo *Info) { 454 assert(Info->IsVerbatimBlockCommand); 455 456 VerbatimBlockEndCommandName.clear(); 457 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 458 VerbatimBlockEndCommandName.append(Info->EndCommandName); 459 460 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 461 T.setVerbatimBlockID(Info->getID()); 462 463 // If there is a newline following the verbatim opening command, skip the 464 // newline so that we don't create an tok::verbatim_block_line with empty 465 // text content. 466 if (BufferPtr != CommentEnd && 467 isVerticalWhitespace(*BufferPtr)) { 468 BufferPtr = skipNewline(BufferPtr, CommentEnd); 469 State = LS_VerbatimBlockBody; 470 return; 471 } 472 473 State = LS_VerbatimBlockFirstLine; 474 } 475 476 void Lexer::lexVerbatimBlockFirstLine(Token &T) { 477 again: 478 assert(BufferPtr < CommentEnd); 479 480 // FIXME: It would be better to scan the text once, finding either the block 481 // end command or newline. 482 // 483 // Extract current line. 484 const char *Newline = findNewline(BufferPtr, CommentEnd); 485 StringRef Line(BufferPtr, Newline - BufferPtr); 486 487 // Look for end command in current line. 488 size_t Pos = Line.find(VerbatimBlockEndCommandName); 489 const char *TextEnd; 490 const char *NextLine; 491 if (Pos == StringRef::npos) { 492 // Current line is completely verbatim. 493 TextEnd = Newline; 494 NextLine = skipNewline(Newline, CommentEnd); 495 } else if (Pos == 0) { 496 // Current line contains just an end command. 497 const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 498 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 499 formTokenWithChars(T, End, tok::verbatim_block_end); 500 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); 501 State = LS_Normal; 502 return; 503 } else { 504 // There is some text, followed by end command. Extract text first. 505 TextEnd = BufferPtr + Pos; 506 NextLine = TextEnd; 507 // If there is only whitespace before end command, skip whitespace. 508 if (isWhitespace(BufferPtr, TextEnd)) { 509 BufferPtr = TextEnd; 510 goto again; 511 } 512 } 513 514 StringRef Text(BufferPtr, TextEnd - BufferPtr); 515 formTokenWithChars(T, NextLine, tok::verbatim_block_line); 516 T.setVerbatimBlockText(Text); 517 518 State = LS_VerbatimBlockBody; 519 } 520 521 void Lexer::lexVerbatimBlockBody(Token &T) { 522 assert(State == LS_VerbatimBlockBody); 523 524 if (CommentState == LCS_InsideCComment) 525 skipLineStartingDecorations(); 526 527 if (BufferPtr == CommentEnd) { 528 formTokenWithChars(T, BufferPtr, tok::verbatim_block_line); 529 T.setVerbatimBlockText(""); 530 return; 531 } 532 533 lexVerbatimBlockFirstLine(T); 534 } 535 536 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, 537 const CommandInfo *Info) { 538 assert(Info->IsVerbatimLineCommand); 539 formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 540 T.setVerbatimLineID(Info->getID()); 541 542 State = LS_VerbatimLineText; 543 } 544 545 void Lexer::lexVerbatimLineText(Token &T) { 546 assert(State == LS_VerbatimLineText); 547 548 // Extract current line. 549 const char *Newline = findNewline(BufferPtr, CommentEnd); 550 StringRef Text(BufferPtr, Newline - BufferPtr); 551 formTokenWithChars(T, Newline, tok::verbatim_line_text); 552 T.setVerbatimLineText(Text); 553 554 State = LS_Normal; 555 } 556 557 void Lexer::lexHTMLCharacterReference(Token &T) { 558 const char *TokenPtr = BufferPtr; 559 assert(*TokenPtr == '&'); 560 TokenPtr++; 561 if (TokenPtr == CommentEnd) { 562 formTextToken(T, TokenPtr); 563 return; 564 } 565 const char *NamePtr; 566 bool isNamed = false; 567 bool isDecimal = false; 568 char C = *TokenPtr; 569 if (isHTMLNamedCharacterReferenceCharacter(C)) { 570 NamePtr = TokenPtr; 571 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 572 isNamed = true; 573 } else if (C == '#') { 574 TokenPtr++; 575 if (TokenPtr == CommentEnd) { 576 formTextToken(T, TokenPtr); 577 return; 578 } 579 C = *TokenPtr; 580 if (isHTMLDecimalCharacterReferenceCharacter(C)) { 581 NamePtr = TokenPtr; 582 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 583 isDecimal = true; 584 } else if (C == 'x' || C == 'X') { 585 TokenPtr++; 586 NamePtr = TokenPtr; 587 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 588 } else { 589 formTextToken(T, TokenPtr); 590 return; 591 } 592 } else { 593 formTextToken(T, TokenPtr); 594 return; 595 } 596 if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 597 *TokenPtr != ';') { 598 formTextToken(T, TokenPtr); 599 return; 600 } 601 StringRef Name(NamePtr, TokenPtr - NamePtr); 602 TokenPtr++; // Skip semicolon. 603 StringRef Resolved; 604 if (isNamed) 605 Resolved = resolveHTMLNamedCharacterReference(Name); 606 else if (isDecimal) 607 Resolved = resolveHTMLDecimalCharacterReference(Name); 608 else 609 Resolved = resolveHTMLHexCharacterReference(Name); 610 611 if (Resolved.empty()) { 612 formTextToken(T, TokenPtr); 613 return; 614 } 615 formTokenWithChars(T, TokenPtr, tok::text); 616 T.setText(Resolved); 617 } 618 619 void Lexer::setupAndLexHTMLStartTag(Token &T) { 620 assert(BufferPtr[0] == '<' && 621 isHTMLIdentifierStartingCharacter(BufferPtr[1])); 622 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 623 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 624 if (!isHTMLTagName(Name)) { 625 formTextToken(T, TagNameEnd); 626 return; 627 } 628 629 formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 630 T.setHTMLTagStartName(Name); 631 632 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 633 634 const char C = *BufferPtr; 635 if (BufferPtr != CommentEnd && 636 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 637 State = LS_HTMLStartTag; 638 } 639 640 void Lexer::lexHTMLStartTag(Token &T) { 641 assert(State == LS_HTMLStartTag); 642 643 const char *TokenPtr = BufferPtr; 644 char C = *TokenPtr; 645 if (isHTMLIdentifierCharacter(C)) { 646 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 647 StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 648 formTokenWithChars(T, TokenPtr, tok::html_ident); 649 T.setHTMLIdent(Ident); 650 } else { 651 switch (C) { 652 case '=': 653 TokenPtr++; 654 formTokenWithChars(T, TokenPtr, tok::html_equals); 655 break; 656 case '\"': 657 case '\'': { 658 const char *OpenQuote = TokenPtr; 659 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 660 const char *ClosingQuote = TokenPtr; 661 if (TokenPtr != CommentEnd) // Skip closing quote. 662 TokenPtr++; 663 formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 664 T.setHTMLQuotedString(StringRef(OpenQuote + 1, 665 ClosingQuote - (OpenQuote + 1))); 666 break; 667 } 668 case '>': 669 TokenPtr++; 670 formTokenWithChars(T, TokenPtr, tok::html_greater); 671 State = LS_Normal; 672 return; 673 case '/': 674 TokenPtr++; 675 if (TokenPtr != CommentEnd && *TokenPtr == '>') { 676 TokenPtr++; 677 formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 678 } else 679 formTextToken(T, TokenPtr); 680 681 State = LS_Normal; 682 return; 683 } 684 } 685 686 // Now look ahead and return to normal state if we don't see any HTML tokens 687 // ahead. 688 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 689 if (BufferPtr == CommentEnd) { 690 State = LS_Normal; 691 return; 692 } 693 694 C = *BufferPtr; 695 if (!isHTMLIdentifierStartingCharacter(C) && 696 C != '=' && C != '\"' && C != '\'' && C != '>') { 697 State = LS_Normal; 698 return; 699 } 700 } 701 702 void Lexer::setupAndLexHTMLEndTag(Token &T) { 703 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 704 705 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 706 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 707 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); 708 if (!isHTMLTagName(Name)) { 709 formTextToken(T, TagNameEnd); 710 return; 711 } 712 713 const char *End = skipWhitespace(TagNameEnd, CommentEnd); 714 715 formTokenWithChars(T, End, tok::html_end_tag); 716 T.setHTMLTagEndName(Name); 717 718 if (BufferPtr != CommentEnd && *BufferPtr == '>') 719 State = LS_HTMLEndTag; 720 } 721 722 void Lexer::lexHTMLEndTag(Token &T) { 723 assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 724 725 formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 726 State = LS_Normal; 727 } 728 729 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 730 const CommandTraits &Traits, 731 SourceLocation FileLoc, 732 const char *BufferStart, const char *BufferEnd): 733 Allocator(Allocator), Diags(Diags), Traits(Traits), 734 BufferStart(BufferStart), BufferEnd(BufferEnd), 735 FileLoc(FileLoc), BufferPtr(BufferStart), 736 CommentState(LCS_BeforeComment), State(LS_Normal) { 737 } 738 739 void Lexer::lex(Token &T) { 740 again: 741 switch (CommentState) { 742 case LCS_BeforeComment: 743 if (BufferPtr == BufferEnd) { 744 formTokenWithChars(T, BufferPtr, tok::eof); 745 return; 746 } 747 748 assert(*BufferPtr == '/'); 749 BufferPtr++; // Skip first slash. 750 switch(*BufferPtr) { 751 case '/': { // BCPL comment. 752 BufferPtr++; // Skip second slash. 753 754 if (BufferPtr != BufferEnd) { 755 // Skip Doxygen magic marker, if it is present. 756 // It might be missing because of a typo //< or /*<, or because we 757 // merged this non-Doxygen comment into a bunch of Doxygen comments 758 // around it: /** ... */ /* ... */ /** ... */ 759 const char C = *BufferPtr; 760 if (C == '/' || C == '!') 761 BufferPtr++; 762 } 763 764 // Skip less-than symbol that marks trailing comments. 765 // Skip it even if the comment is not a Doxygen one, because //< and /*< 766 // are frequent typos. 767 if (BufferPtr != BufferEnd && *BufferPtr == '<') 768 BufferPtr++; 769 770 CommentState = LCS_InsideBCPLComment; 771 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 772 State = LS_Normal; 773 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 774 goto again; 775 } 776 case '*': { // C comment. 777 BufferPtr++; // Skip star. 778 779 // Skip Doxygen magic marker. 780 const char C = *BufferPtr; 781 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 782 BufferPtr++; 783 784 // Skip less-than symbol that marks trailing comments. 785 if (BufferPtr != BufferEnd && *BufferPtr == '<') 786 BufferPtr++; 787 788 CommentState = LCS_InsideCComment; 789 State = LS_Normal; 790 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 791 goto again; 792 } 793 default: 794 llvm_unreachable("second character of comment should be '/' or '*'"); 795 } 796 797 case LCS_BetweenComments: { 798 // Consecutive comments are extracted only if there is only whitespace 799 // between them. So we can search for the start of the next comment. 800 const char *EndWhitespace = BufferPtr; 801 while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 802 EndWhitespace++; 803 804 // Turn any whitespace between comments (and there is only whitespace 805 // between them -- guaranteed by comment extraction) into a newline. We 806 // have two newlines between C comments in total (first one was synthesized 807 // after a comment). 808 formTokenWithChars(T, EndWhitespace, tok::newline); 809 810 CommentState = LCS_BeforeComment; 811 break; 812 } 813 814 case LCS_InsideBCPLComment: 815 case LCS_InsideCComment: 816 if (BufferPtr != CommentEnd) { 817 lexCommentText(T); 818 break; 819 } else { 820 // Skip C comment closing sequence. 821 if (CommentState == LCS_InsideCComment) { 822 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 823 BufferPtr += 2; 824 assert(BufferPtr <= BufferEnd); 825 826 // Synthenize newline just after the C comment, regardless if there is 827 // actually a newline. 828 formTokenWithChars(T, BufferPtr, tok::newline); 829 830 CommentState = LCS_BetweenComments; 831 break; 832 } else { 833 // Don't synthesized a newline after BCPL comment. 834 CommentState = LCS_BetweenComments; 835 goto again; 836 } 837 } 838 } 839 } 840 841 StringRef Lexer::getSpelling(const Token &Tok, 842 const SourceManager &SourceMgr, 843 bool *Invalid) const { 844 SourceLocation Loc = Tok.getLocation(); 845 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 846 847 bool InvalidTemp = false; 848 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 849 if (InvalidTemp) { 850 *Invalid = true; 851 return StringRef(); 852 } 853 854 const char *Begin = File.data() + LocInfo.second; 855 return StringRef(Begin, Tok.getLength()); 856 } 857 858 } // end namespace comments 859 } // end namespace clang 860