1 #include "clang/AST/CommentLexer.h"
2 #include "clang/Basic/ConvertUTF.h"
3 #include "llvm/ADT/StringSwitch.h"
4 #include "llvm/Support/ErrorHandling.h"
5 
6 namespace clang {
7 namespace comments {
8 
9 void Token::dump(const Lexer &L, const SourceManager &SM) const {
10   llvm::errs() << "comments::Token Kind=" << Kind << " ";
11   Loc.dump(SM);
12   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
13 }
14 
15 bool Lexer::isVerbatimBlockCommand(StringRef BeginName,
16                                   StringRef &EndName) const {
17   const char *Result = llvm::StringSwitch<const char *>(BeginName)
18     .Case("code", "endcode")
19     .Case("verbatim", "endverbatim")
20     .Case("htmlonly", "endhtmlonly")
21     .Case("latexonly", "endlatexonly")
22     .Case("xmlonly", "endxmlonly")
23     .Case("manonly", "endmanonly")
24     .Case("rtfonly", "endrtfonly")
25 
26     .Case("dot", "enddot")
27     .Case("msc", "endmsc")
28 
29     .Case("f$", "f$") // Inline LaTeX formula
30     .Case("f[", "f]") // Displayed LaTeX formula
31     .Case("f{", "f}") // LaTeX environment
32 
33     .Default(NULL);
34 
35   if (Result) {
36     EndName = Result;
37     return true;
38   }
39 
40   for (VerbatimBlockCommandVector::const_iterator
41            I = VerbatimBlockCommands.begin(),
42            E = VerbatimBlockCommands.end();
43        I != E; ++I)
44     if (I->BeginName == BeginName) {
45       EndName = I->EndName;
46       return true;
47     }
48 
49   return false;
50 }
51 
52 bool Lexer::isVerbatimLineCommand(StringRef Name) const {
53   bool Result = llvm::StringSwitch<bool>(Name)
54   .Case("fn", true)
55   .Case("var", true)
56   .Case("property", true)
57   .Case("typedef", true)
58 
59   .Case("overload", true)
60 
61   .Case("defgroup", true)
62   .Case("ingroup", true)
63   .Case("addtogroup", true)
64   .Case("weakgroup", true)
65   .Case("name", true)
66 
67   .Case("section", true)
68   .Case("subsection", true)
69   .Case("subsubsection", true)
70   .Case("paragraph", true)
71 
72   .Case("mainpage", true)
73   .Case("subpage", true)
74   .Case("ref", true)
75 
76   .Default(false);
77 
78   if (Result)
79     return true;
80 
81   for (VerbatimLineCommandVector::const_iterator
82            I = VerbatimLineCommands.begin(),
83            E = VerbatimLineCommands.end();
84        I != E; ++I)
85     if (I->Name == Name)
86       return true;
87 
88   return false;
89 }
90 
91 namespace {
/// Returns true if \p C lies in 'a'..'z' or 'A'..'Z', the characters accepted
/// in the name of a named character reference.
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  return IsLower || IsUpper;
}
96 
/// Returns true if \p C lies in '0'..'9', the characters accepted in a decimal
/// character reference.
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  if (C < '0')
    return false;
  return C <= '9';
}
100 
/// Returns true if \p C lies in '0'..'9', 'a'..'f' or 'A'..'F', the
/// characters accepted in a hexadecimal character reference.
bool isHTMLHexCharacterReferenceCharacter(char C) {
  const bool IsDigit = '0' <= C && C <= '9';
  const bool IsLowerHex = 'a' <= C && C <= 'f';
  const bool IsUpperHex = 'A' <= C && C <= 'F';
  return IsDigit || IsLowerHex || IsUpperHex;
}
106 } // unnamed namespace
107 
108 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
109   return llvm::StringSwitch<StringRef>(Name)
110       .Case("amp", "&")
111       .Case("lt", "<")
112       .Case("gt", ">")
113       .Case("quot", "\"")
114       .Case("apos", "\'")
115       .Default("");
116 }
117 
118 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
119   unsigned CodePoint = 0;
120   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
121     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
122     CodePoint *= 10;
123     CodePoint += Name[i] - '0';
124   }
125 
126   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
127   char *ResolvedPtr = Resolved;
128   if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
129     return StringRef(Resolved, ResolvedPtr - Resolved);
130   else
131     return StringRef();
132 }
133 
134 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
135   unsigned CodePoint = 0;
136   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
137     CodePoint *= 16;
138     const char C = Name[i];
139     assert(isHTMLHexCharacterReferenceCharacter(C));
140     if (C >= '0' && C <= '9')
141       CodePoint += Name[i] - '0';
142     else if (C >= 'a' && C <= 'f')
143       CodePoint += Name[i] - 'a' + 10;
144     else
145       CodePoint += Name[i] - 'A' + 10;
146   }
147 
148   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
149   char *ResolvedPtr = Resolved;
150   if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
151     return StringRef(Resolved, ResolvedPtr - Resolved);
152   else
153     return StringRef();
154 }
155 
156 void Lexer::skipLineStartingDecorations() {
157   // This function should be called only for C comments
158   assert(CommentState == LCS_InsideCComment);
159 
160   if (BufferPtr == CommentEnd)
161     return;
162 
163   switch (*BufferPtr) {
164   case ' ':
165   case '\t':
166   case '\f':
167   case '\v': {
168     const char *NewBufferPtr = BufferPtr;
169     NewBufferPtr++;
170     if (NewBufferPtr == CommentEnd)
171       return;
172 
173     char C = *NewBufferPtr;
174     while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
175       NewBufferPtr++;
176       if (NewBufferPtr == CommentEnd)
177         return;
178       C = *NewBufferPtr;
179     }
180     if (C == '*')
181       BufferPtr = NewBufferPtr + 1;
182     break;
183   }
184   case '*':
185     BufferPtr++;
186     break;
187   }
188 }
189 
190 namespace {
/// Returns a pointer to the first '\n' or '\r' in [BufferPtr, BufferEnd), or
/// \p BufferEnd if the range contains neither.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd && *Cur != '\n' && *Cur != '\r')
    ++Cur;
  return Cur;
}
200 
/// Skip one line terminator ("\n", "\r" or "\r\n") starting at \p BufferPtr
/// and return a pointer just past it.  Returns \p BufferPtr unchanged when it
/// equals \p BufferEnd; otherwise \p BufferPtr must point at '\n' or '\r'.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    return BufferPtr + 1;

  assert(*BufferPtr == '\r');
  const char *AfterCR = BufferPtr + 1;
  // Treat "\r\n" as a single line terminator.
  if (AfterCR != BufferEnd && *AfterCR == '\n')
    return AfterCR + 1;
  return AfterCR;
}
215 
216 const char *skipNamedCharacterReference(const char *BufferPtr,
217                                         const char *BufferEnd) {
218   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
219     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
220       return BufferPtr;
221   }
222   return BufferEnd;
223 }
224 
225 const char *skipDecimalCharacterReference(const char *BufferPtr,
226                                           const char *BufferEnd) {
227   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
228     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
229       return BufferPtr;
230   }
231   return BufferEnd;
232 }
233 
234 const char *skipHexCharacterReference(const char *BufferPtr,
235                                           const char *BufferEnd) {
236   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
237     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
238       return BufferPtr;
239   }
240   return BufferEnd;
241 }
242 
/// Returns true if \p C lies in 'a'..'z' or 'A'..'Z', the characters that may
/// begin an HTML identifier.
bool isHTMLIdentifierStartingCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  return IsLower || IsUpper;
}
247 
/// Returns true if \p C lies in 'a'..'z', 'A'..'Z' or '0'..'9', the
/// characters that may appear in an HTML identifier.
bool isHTMLIdentifierCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  const bool IsDigit = '0' <= C && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
253 
254 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
255   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
256     if (!isHTMLIdentifierCharacter(*BufferPtr))
257       return BufferPtr;
258   }
259   return BufferEnd;
260 }
261 
/// Skip an HTML string quoted in single or double quotes.  \p BufferPtr must
/// point at the opening quote.  A quote character immediately preceded by a
/// backslash does not terminate the string.
///
/// Returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    if (*Cur == Quote && *(Cur - 1) != '\\')
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
279 
/// Returns true if \p C is a space, horizontal tab, form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
283 
/// Returns true if \p C is a space, line feed, carriage return, horizontal
/// tab, form feed or vertical tab.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
288 
289 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
290   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
291     if (!isWhitespace(*BufferPtr))
292       return BufferPtr;
293   }
294   return BufferEnd;
295 }
296 
297 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
298   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
299 }
300 
/// Returns true if \p C lies in 'a'..'z', 'A'..'Z' or '0'..'9', the
/// characters that may appear in a command name.
bool isCommandNameCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  const bool IsDigit = '0' <= C && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
306 
307 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
308   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
309     if (!isCommandNameCharacter(*BufferPtr))
310       return BufferPtr;
311   }
312   return BufferEnd;
313 }
314 
315 /// Return the one past end pointer for BCPL comments.
316 /// Handles newlines escaped with backslash or trigraph for backslahs.
317 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
318   const char *CurPtr = BufferPtr;
319   while (CurPtr != BufferEnd) {
320     char C = *CurPtr;
321     while (C != '\n' && C != '\r') {
322       CurPtr++;
323       if (CurPtr == BufferEnd)
324         return BufferEnd;
325       C = *CurPtr;
326     }
327     // We found a newline, check if it is escaped.
328     const char *EscapePtr = CurPtr - 1;
329     while(isHorizontalWhitespace(*EscapePtr))
330       EscapePtr--;
331 
332     if (*EscapePtr == '\\' ||
333         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
334          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
335       // We found an escaped newline.
336       CurPtr = skipNewline(CurPtr, BufferEnd);
337     } else
338       return CurPtr; // Not an escaped newline.
339   }
340   return BufferEnd;
341 }
342 
343 /// Return the one past end pointer for C comments.
344 /// Very dumb, does not handle escaped newlines or trigraphs.
345 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
346   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
347     if (*BufferPtr == '*') {
348       assert(BufferPtr + 1 != BufferEnd);
349       if (*(BufferPtr + 1) == '/')
350         return BufferPtr;
351     }
352   }
353   llvm_unreachable("buffer end hit before '*/' was seen");
354 }
355 } // unnamed namespace
356 
357 void Lexer::lexCommentText(Token &T) {
358   assert(CommentState == LCS_InsideBCPLComment ||
359          CommentState == LCS_InsideCComment);
360 
361   switch (State) {
362   case LS_Normal:
363     break;
364   case LS_VerbatimBlockFirstLine:
365     lexVerbatimBlockFirstLine(T);
366     return;
367   case LS_VerbatimBlockBody:
368     lexVerbatimBlockBody(T);
369     return;
370   case LS_VerbatimLineText:
371     lexVerbatimLineText(T);
372     return;
373   case LS_HTMLStartTag:
374     lexHTMLStartTag(T);
375     return;
376   case LS_HTMLEndTag:
377     lexHTMLEndTag(T);
378     return;
379   }
380 
381   assert(State == LS_Normal);
382 
383   const char *TokenPtr = BufferPtr;
384   assert(TokenPtr < CommentEnd);
385   while (TokenPtr != CommentEnd) {
386     switch(*TokenPtr) {
387       case '\\':
388       case '@': {
389         TokenPtr++;
390         if (TokenPtr == CommentEnd) {
391           formTextToken(T, TokenPtr);
392           return;
393         }
394         char C = *TokenPtr;
395         switch (C) {
396         default:
397           break;
398 
399         case '\\': case '@': case '&': case '$':
400         case '#':  case '<': case '>': case '%':
401         case '\"': case '.': case ':':
402           // This is one of \\ \@ \& \$ etc escape sequences.
403           TokenPtr++;
404           if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
405             // This is the \:: escape sequence.
406             TokenPtr++;
407           }
408           StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
409           formTokenWithChars(T, TokenPtr, tok::text);
410           T.setText(UnescapedText);
411           return;
412         }
413 
414         // Don't make zero-length commands.
415         if (!isCommandNameCharacter(*TokenPtr)) {
416           formTextToken(T, TokenPtr);
417           return;
418         }
419 
420         TokenPtr = skipCommandName(TokenPtr, CommentEnd);
421         unsigned Length = TokenPtr - (BufferPtr + 1);
422 
423         // Hardcoded support for lexing LaTeX formula commands
424         // \f$ \f[ \f] \f{ \f} as a single command.
425         if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
426           C = *TokenPtr;
427           if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
428             TokenPtr++;
429             Length++;
430           }
431         }
432 
433         const StringRef CommandName(BufferPtr + 1, Length);
434         StringRef EndName;
435 
436         if (isVerbatimBlockCommand(CommandName, EndName)) {
437           setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
438           return;
439         }
440         if (isVerbatimLineCommand(CommandName)) {
441           setupAndLexVerbatimLine(T, TokenPtr);
442           return;
443         }
444         formTokenWithChars(T, TokenPtr, tok::command);
445         T.setCommandName(CommandName);
446         return;
447       }
448 
449       case '&':
450         lexHTMLCharacterReference(T);
451         return;
452 
453       case '<': {
454         TokenPtr++;
455         if (TokenPtr == CommentEnd) {
456           formTextToken(T, TokenPtr);
457           return;
458         }
459         const char C = *TokenPtr;
460         if (isHTMLIdentifierStartingCharacter(C))
461           setupAndLexHTMLStartTag(T);
462         else if (C == '/')
463           setupAndLexHTMLEndTag(T);
464         else
465           formTextToken(T, TokenPtr);
466 
467         return;
468       }
469 
470       case '\n':
471       case '\r':
472         TokenPtr = skipNewline(TokenPtr, CommentEnd);
473         formTokenWithChars(T, TokenPtr, tok::newline);
474 
475         if (CommentState == LCS_InsideCComment)
476           skipLineStartingDecorations();
477         return;
478 
479       default: {
480         while (true) {
481           TokenPtr++;
482           if (TokenPtr == CommentEnd)
483             break;
484           const char C = *TokenPtr;
485           if(C == '\n' || C == '\r' ||
486              C == '\\' || C == '@' || C == '&' || C == '<')
487             break;
488         }
489         formTextToken(T, TokenPtr);
490         return;
491       }
492     }
493   }
494 }
495 
496 void Lexer::setupAndLexVerbatimBlock(Token &T,
497                                      const char *TextBegin,
498                                      char Marker, StringRef EndName) {
499   VerbatimBlockEndCommandName.clear();
500   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
501   VerbatimBlockEndCommandName.append(EndName);
502 
503   StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1));
504   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
505   T.setVerbatimBlockName(Name);
506 
507   // If there is a newline following the verbatim opening command, skip the
508   // newline so that we don't create an tok::verbatim_block_line with empty
509   // text content.
510   if (BufferPtr != CommentEnd) {
511     const char C = *BufferPtr;
512     if (C == '\n' || C == '\r') {
513       BufferPtr = skipNewline(BufferPtr, CommentEnd);
514       State = LS_VerbatimBlockBody;
515       return;
516     }
517   }
518 
519   State = LS_VerbatimBlockFirstLine;
520 }
521 
522 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
523 again:
524   assert(BufferPtr < CommentEnd);
525 
526   // FIXME: It would be better to scan the text once, finding either the block
527   // end command or newline.
528   //
529   // Extract current line.
530   const char *Newline = findNewline(BufferPtr, CommentEnd);
531   StringRef Line(BufferPtr, Newline - BufferPtr);
532 
533   // Look for end command in current line.
534   size_t Pos = Line.find(VerbatimBlockEndCommandName);
535   const char *TextEnd;
536   const char *NextLine;
537   if (Pos == StringRef::npos) {
538     // Current line is completely verbatim.
539     TextEnd = Newline;
540     NextLine = skipNewline(Newline, CommentEnd);
541   } else if (Pos == 0) {
542     // Current line contains just an end command.
543     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
544     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
545     formTokenWithChars(T, End, tok::verbatim_block_end);
546     T.setVerbatimBlockName(Name);
547     State = LS_Normal;
548     return;
549   } else {
550     // There is some text, followed by end command.  Extract text first.
551     TextEnd = BufferPtr + Pos;
552     NextLine = TextEnd;
553     // If there is only whitespace before end command, skip whitespace.
554     if (isWhitespace(BufferPtr, TextEnd)) {
555       BufferPtr = TextEnd;
556       goto again;
557     }
558   }
559 
560   StringRef Text(BufferPtr, TextEnd - BufferPtr);
561   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
562   T.setVerbatimBlockText(Text);
563 
564   State = LS_VerbatimBlockBody;
565 }
566 
567 void Lexer::lexVerbatimBlockBody(Token &T) {
568   assert(State == LS_VerbatimBlockBody);
569 
570   if (CommentState == LCS_InsideCComment)
571     skipLineStartingDecorations();
572 
573   lexVerbatimBlockFirstLine(T);
574 }
575 
576 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) {
577   const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
578   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
579   T.setVerbatimLineName(Name);
580 
581   State = LS_VerbatimLineText;
582 }
583 
584 void Lexer::lexVerbatimLineText(Token &T) {
585   assert(State == LS_VerbatimLineText);
586 
587   // Extract current line.
588   const char *Newline = findNewline(BufferPtr, CommentEnd);
589   const StringRef Text(BufferPtr, Newline - BufferPtr);
590   formTokenWithChars(T, Newline, tok::verbatim_line_text);
591   T.setVerbatimLineText(Text);
592 
593   State = LS_Normal;
594 }
595 
596 void Lexer::lexHTMLCharacterReference(Token &T) {
597   const char *TokenPtr = BufferPtr;
598   assert(*TokenPtr == '&');
599   TokenPtr++;
600   if (TokenPtr == CommentEnd) {
601     formTextToken(T, TokenPtr);
602     return;
603   }
604   const char *NamePtr;
605   bool isNamed = false;
606   bool isDecimal = false;
607   char C = *TokenPtr;
608   if (isHTMLNamedCharacterReferenceCharacter(C)) {
609     NamePtr = TokenPtr;
610     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
611     isNamed = true;
612   } else if (C == '#') {
613     TokenPtr++;
614     if (TokenPtr == CommentEnd) {
615       formTextToken(T, TokenPtr);
616       return;
617     }
618     C = *TokenPtr;
619     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
620       NamePtr = TokenPtr;
621       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
622       isDecimal = true;
623     } else if (C == 'x' || C == 'X') {
624       TokenPtr++;
625       NamePtr = TokenPtr;
626       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
627     } else {
628       formTextToken(T, TokenPtr);
629       return;
630     }
631   } else {
632     formTextToken(T, TokenPtr);
633     return;
634   }
635   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
636       *TokenPtr != ';') {
637     formTextToken(T, TokenPtr);
638     return;
639   }
640   StringRef Name(NamePtr, TokenPtr - NamePtr);
641   TokenPtr++; // Skip semicolon.
642   StringRef Resolved;
643   if (isNamed)
644     Resolved = resolveHTMLNamedCharacterReference(Name);
645   else if (isDecimal)
646     Resolved = resolveHTMLDecimalCharacterReference(Name);
647   else
648     Resolved = resolveHTMLHexCharacterReference(Name);
649 
650   if (Resolved.empty()) {
651     formTextToken(T, TokenPtr);
652     return;
653   }
654   formTokenWithChars(T, TokenPtr, tok::text);
655   T.setText(Resolved);
656   return;
657 }
658 
659 void Lexer::setupAndLexHTMLStartTag(Token &T) {
660   assert(BufferPtr[0] == '<' &&
661          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
662   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
663 
664   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
665   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
666   T.setHTMLTagStartName(Name);
667 
668   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
669 
670   const char C = *BufferPtr;
671   if (BufferPtr != CommentEnd &&
672       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
673     State = LS_HTMLStartTag;
674 }
675 
676 void Lexer::lexHTMLStartTag(Token &T) {
677   assert(State == LS_HTMLStartTag);
678 
679   const char *TokenPtr = BufferPtr;
680   char C = *TokenPtr;
681   if (isHTMLIdentifierCharacter(C)) {
682     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
683     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
684     formTokenWithChars(T, TokenPtr, tok::html_ident);
685     T.setHTMLIdent(Ident);
686   } else {
687     switch (C) {
688     case '=':
689       TokenPtr++;
690       formTokenWithChars(T, TokenPtr, tok::html_equals);
691       break;
692     case '\"':
693     case '\'': {
694       const char *OpenQuote = TokenPtr;
695       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
696       const char *ClosingQuote = TokenPtr;
697       if (TokenPtr != CommentEnd) // Skip closing quote.
698         TokenPtr++;
699       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
700       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
701                                       ClosingQuote - (OpenQuote + 1)));
702       break;
703     }
704     case '>':
705       TokenPtr++;
706       formTokenWithChars(T, TokenPtr, tok::html_greater);
707       State = LS_Normal;
708       return;
709     case '/':
710       TokenPtr++;
711       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
712         TokenPtr++;
713         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
714       } else
715         formTextToken(T, TokenPtr);
716 
717       State = LS_Normal;
718       return;
719     }
720   }
721 
722   // Now look ahead and return to normal state if we don't see any HTML tokens
723   // ahead.
724   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
725   if (BufferPtr == CommentEnd) {
726     State = LS_Normal;
727     return;
728   }
729 
730   C = *BufferPtr;
731   if (!isHTMLIdentifierStartingCharacter(C) &&
732       C != '=' && C != '\"' && C != '\'' && C != '>') {
733     State = LS_Normal;
734     return;
735   }
736 }
737 
738 void Lexer::setupAndLexHTMLEndTag(Token &T) {
739   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
740 
741   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
742   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
743 
744   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
745 
746   formTokenWithChars(T, End, tok::html_end_tag);
747   T.setHTMLTagEndName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
748 
749   if (BufferPtr != CommentEnd && *BufferPtr == '>')
750     State = LS_HTMLEndTag;
751 }
752 
753 void Lexer::lexHTMLEndTag(Token &T) {
754   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
755 
756   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
757   State = LS_Normal;
758 }
759 
760 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator,
761              SourceLocation FileLoc, const CommentOptions &CommOpts,
762              const char *BufferStart, const char *BufferEnd):
763     Allocator(Allocator),
764     BufferStart(BufferStart), BufferEnd(BufferEnd),
765     FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
766     CommentState(LCS_BeforeComment), State(LS_Normal) {
767 }
768 
769 void Lexer::lex(Token &T) {
770 again:
771   switch (CommentState) {
772   case LCS_BeforeComment:
773     if (BufferPtr == BufferEnd) {
774       formTokenWithChars(T, BufferPtr, tok::eof);
775       return;
776     }
777 
778     assert(*BufferPtr == '/');
779     BufferPtr++; // Skip first slash.
780     switch(*BufferPtr) {
781     case '/': { // BCPL comment.
782       BufferPtr++; // Skip second slash.
783 
784       if (BufferPtr != BufferEnd) {
785         // Skip Doxygen magic marker, if it is present.
786         // It might be missing because of a typo //< or /*<, or because we
787         // merged this non-Doxygen comment into a bunch of Doxygen comments
788         // around it: /** ... */ /* ... */ /** ... */
789         const char C = *BufferPtr;
790         if (C == '/' || C == '!')
791           BufferPtr++;
792       }
793 
794       // Skip less-than symbol that marks trailing comments.
795       // Skip it even if the comment is not a Doxygen one, because //< and /*<
796       // are frequent typos.
797       if (BufferPtr != BufferEnd && *BufferPtr == '<')
798         BufferPtr++;
799 
800       CommentState = LCS_InsideBCPLComment;
801       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
802         State = LS_Normal;
803       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
804       goto again;
805     }
806     case '*': { // C comment.
807       BufferPtr++; // Skip star.
808 
809       // Skip Doxygen magic marker.
810       const char C = *BufferPtr;
811       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
812         BufferPtr++;
813 
814       // Skip less-than symbol that marks trailing comments.
815       if (BufferPtr != BufferEnd && *BufferPtr == '<')
816         BufferPtr++;
817 
818       CommentState = LCS_InsideCComment;
819       State = LS_Normal;
820       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
821       goto again;
822     }
823     default:
824       llvm_unreachable("second character of comment should be '/' or '*'");
825     }
826 
827   case LCS_BetweenComments: {
828     // Consecutive comments are extracted only if there is only whitespace
829     // between them.  So we can search for the start of the next comment.
830     const char *EndWhitespace = BufferPtr;
831     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
832       EndWhitespace++;
833 
834     // Turn any whitespace between comments (and there is only whitespace
835     // between them -- guaranteed by comment extraction) into a newline.  We
836     // have two newlines between C comments in total (first one was synthesized
837     // after a comment).
838     formTokenWithChars(T, EndWhitespace, tok::newline);
839 
840     CommentState = LCS_BeforeComment;
841     break;
842   }
843 
844   case LCS_InsideBCPLComment:
845   case LCS_InsideCComment:
846     if (BufferPtr != CommentEnd) {
847       lexCommentText(T);
848       break;
849     } else {
850       // Skip C comment closing sequence.
851       if (CommentState == LCS_InsideCComment) {
852         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
853         BufferPtr += 2;
854         assert(BufferPtr <= BufferEnd);
855 
856         // Synthenize newline just after the C comment, regardless if there is
857         // actually a newline.
858         formTokenWithChars(T, BufferPtr, tok::newline);
859 
860         CommentState = LCS_BetweenComments;
861         break;
862       } else {
863         // Don't synthesized a newline after BCPL comment.
864         CommentState = LCS_BetweenComments;
865         goto again;
866       }
867     }
868   }
869 }
870 
871 StringRef Lexer::getSpelling(const Token &Tok,
872                              const SourceManager &SourceMgr,
873                              bool *Invalid) const {
874   SourceLocation Loc = Tok.getLocation();
875   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
876 
877   bool InvalidTemp = false;
878   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
879   if (InvalidTemp) {
880     *Invalid = true;
881     return StringRef();
882   }
883 
884   const char *Begin = File.data() + LocInfo.second;
885   return StringRef(Begin, Tok.getLength());
886 }
887 
888 void Lexer::addVerbatimBlockCommand(StringRef BeginName, StringRef EndName) {
889   VerbatimBlockCommand VBC;
890   VBC.BeginName = BeginName;
891   VBC.EndName = EndName;
892   VerbatimBlockCommands.push_back(VBC);
893 }
894 
895 void Lexer::addVerbatimLineCommand(StringRef Name) {
896   VerbatimLineCommand VLC;
897   VLC.Name = Name;
898   VerbatimLineCommands.push_back(VLC);
899 }
900 
901 } // end namespace comments
902 } // end namespace clang
903 
904