1 #include "clang/AST/CommentLexer.h"
2 #include "clang/AST/CommentCommandTraits.h"
3 #include "clang/Basic/ConvertUTF.h"
4 #include "llvm/ADT/StringSwitch.h"
5 #include "llvm/Support/ErrorHandling.h"
6 
7 namespace clang {
8 namespace comments {
9 
10 void Token::dump(const Lexer &L, const SourceManager &SM) const {
11   llvm::errs() << "comments::Token Kind=" << Kind << " ";
12   Loc.dump(SM);
13   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
14 }
15 
16 namespace {
/// Returns true if \p C is an ASCII letter, i.e. a character accepted in the
/// name part of a named HTML character reference such as "&amp;".
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  const bool IsLowercase = 'a' <= C && C <= 'z';
  const bool IsUppercase = 'A' <= C && C <= 'Z';
  return IsLowercase || IsUppercase;
}
21 
/// Returns true if \p C is an ASCII decimal digit, i.e. a character accepted
/// in the numeric part of a decimal character reference such as "&#65;".
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  if (C < '0')
    return false;
  return C <= '9';
}
25 
/// Returns true if \p C is an ASCII hexadecimal digit (0-9, a-f, A-F), i.e. a
/// character accepted in the numeric part of a reference such as "&#x41;".
bool isHTMLHexCharacterReferenceCharacter(char C) {
  if ('0' <= C && C <= '9')
    return true;
  if ('a' <= C && C <= 'f')
    return true;
  return 'A' <= C && C <= 'F';
}
31 
32 #include "clang/AST/CommentHTMLTags.inc"
33 
34 } // unnamed namespace
35 
36 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
37   return llvm::StringSwitch<StringRef>(Name)
38       .Case("amp", "&")
39       .Case("lt", "<")
40       .Case("gt", ">")
41       .Case("quot", "\"")
42       .Case("apos", "\'")
43       .Default("");
44 }
45 
46 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
47   unsigned CodePoint = 0;
48   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
49     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
50     CodePoint *= 10;
51     CodePoint += Name[i] - '0';
52   }
53 
54   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
55   char *ResolvedPtr = Resolved;
56   if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
57     return StringRef(Resolved, ResolvedPtr - Resolved);
58   else
59     return StringRef();
60 }
61 
62 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
63   unsigned CodePoint = 0;
64   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
65     CodePoint *= 16;
66     const char C = Name[i];
67     assert(isHTMLHexCharacterReferenceCharacter(C));
68     if (C >= '0' && C <= '9')
69       CodePoint += Name[i] - '0';
70     else if (C >= 'a' && C <= 'f')
71       CodePoint += Name[i] - 'a' + 10;
72     else
73       CodePoint += Name[i] - 'A' + 10;
74   }
75 
76   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
77   char *ResolvedPtr = Resolved;
78   if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
79     return StringRef(Resolved, ResolvedPtr - Resolved);
80   else
81     return StringRef();
82 }
83 
84 void Lexer::skipLineStartingDecorations() {
85   // This function should be called only for C comments
86   assert(CommentState == LCS_InsideCComment);
87 
88   if (BufferPtr == CommentEnd)
89     return;
90 
91   switch (*BufferPtr) {
92   case ' ':
93   case '\t':
94   case '\f':
95   case '\v': {
96     const char *NewBufferPtr = BufferPtr;
97     NewBufferPtr++;
98     if (NewBufferPtr == CommentEnd)
99       return;
100 
101     char C = *NewBufferPtr;
102     while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
103       NewBufferPtr++;
104       if (NewBufferPtr == CommentEnd)
105         return;
106       C = *NewBufferPtr;
107     }
108     if (C == '*')
109       BufferPtr = NewBufferPtr + 1;
110     break;
111   }
112   case '*':
113     BufferPtr++;
114     break;
115   }
116 }
117 
118 namespace {
/// Scans [BufferPtr, BufferEnd) for the first newline character ('\n' or
/// '\r').
///
/// \returns a pointer to that character, or BufferEnd if there is none.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd && *Cursor != '\n' && *Cursor != '\r')
    ++Cursor;
  return Cursor;
}
128 
/// Skips one newline sequence at \p BufferPtr: "\n", "\r" or "\r\n".
///
/// \p BufferPtr must either equal \p BufferEnd or point at '\n' or '\r'.
///
/// \returns a pointer just past the newline sequence, or \p BufferPtr
/// unchanged if it already equals \p BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single newline.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
143 
144 const char *skipNamedCharacterReference(const char *BufferPtr,
145                                         const char *BufferEnd) {
146   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
147     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
148       return BufferPtr;
149   }
150   return BufferEnd;
151 }
152 
153 const char *skipDecimalCharacterReference(const char *BufferPtr,
154                                           const char *BufferEnd) {
155   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
156     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
157       return BufferPtr;
158   }
159   return BufferEnd;
160 }
161 
162 const char *skipHexCharacterReference(const char *BufferPtr,
163                                           const char *BufferEnd) {
164   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
165     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
166       return BufferPtr;
167   }
168   return BufferEnd;
169 }
170 
/// Returns true if \p C can start an HTML identifier (tag or attribute name):
/// an ASCII letter.
bool isHTMLIdentifierStartingCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  return 'A' <= C && C <= 'Z';
}
175 
/// Returns true if \p C can appear inside an HTML identifier (tag or
/// attribute name): an ASCII letter or decimal digit.
bool isHTMLIdentifierCharacter(char C) {
  const bool IsLetter = ('a' <= C && C <= 'z') || ('A' <= C && C <= 'Z');
  const bool IsDigit = '0' <= C && C <= '9';
  return IsLetter || IsDigit;
}
181 
182 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
183   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
184     if (!isHTMLIdentifierCharacter(*BufferPtr))
185       return BufferPtr;
186   }
187   return BufferEnd;
188 }
189 
/// Skip an HTML string quoted in single or double quotes.  \p BufferPtr must
/// point at the opening quote.  A quote character that is immediately
/// preceded by a backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  // Start right after the opening quote, so Cursor[-1] is always readable.
  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    if (*Cursor == Quote && Cursor[-1] != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
207 
/// Returns true if \p C is whitespace that does not end a line: space,
/// horizontal tab, form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
211 
/// Returns true if \p C is a whitespace character: space, newline, carriage
/// return, horizontal tab, form feed or vertical tab.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
216 
217 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
218   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
219     if (!isWhitespace(*BufferPtr))
220       return BufferPtr;
221   }
222   return BufferEnd;
223 }
224 
225 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
226   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
227 }
228 
/// Returns true if \p C can start a command name: an ASCII letter.
bool isCommandNameStartCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  return 'A' <= C && C <= 'Z';
}
233 
/// Returns true if \p C can appear inside a command name: an ASCII letter or
/// decimal digit.
bool isCommandNameCharacter(char C) {
  const bool IsLetter = ('a' <= C && C <= 'z') || ('A' <= C && C <= 'Z');
  const bool IsDigit = '0' <= C && C <= '9';
  return IsLetter || IsDigit;
}
239 
240 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
241   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
242     if (!isCommandNameCharacter(*BufferPtr))
243       return BufferPtr;
244   }
245   return BufferEnd;
246 }
247 
248 /// Return the one past end pointer for BCPL comments.
249 /// Handles newlines escaped with backslash or trigraph for backslahs.
250 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
251   const char *CurPtr = BufferPtr;
252   while (CurPtr != BufferEnd) {
253     char C = *CurPtr;
254     while (C != '\n' && C != '\r') {
255       CurPtr++;
256       if (CurPtr == BufferEnd)
257         return BufferEnd;
258       C = *CurPtr;
259     }
260     // We found a newline, check if it is escaped.
261     const char *EscapePtr = CurPtr - 1;
262     while(isHorizontalWhitespace(*EscapePtr))
263       EscapePtr--;
264 
265     if (*EscapePtr == '\\' ||
266         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
267          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
268       // We found an escaped newline.
269       CurPtr = skipNewline(CurPtr, BufferEnd);
270     } else
271       return CurPtr; // Not an escaped newline.
272   }
273   return BufferEnd;
274 }
275 
276 /// Return the one past end pointer for C comments.
277 /// Very dumb, does not handle escaped newlines or trigraphs.
278 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
279   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
280     if (*BufferPtr == '*') {
281       assert(BufferPtr + 1 != BufferEnd);
282       if (*(BufferPtr + 1) == '/')
283         return BufferPtr;
284     }
285   }
286   llvm_unreachable("buffer end hit before '*/' was seen");
287 }
288 } // unnamed namespace
289 
290 void Lexer::lexCommentText(Token &T) {
291   assert(CommentState == LCS_InsideBCPLComment ||
292          CommentState == LCS_InsideCComment);
293 
294   switch (State) {
295   case LS_Normal:
296     break;
297   case LS_VerbatimBlockFirstLine:
298     lexVerbatimBlockFirstLine(T);
299     return;
300   case LS_VerbatimBlockBody:
301     lexVerbatimBlockBody(T);
302     return;
303   case LS_VerbatimLineText:
304     lexVerbatimLineText(T);
305     return;
306   case LS_HTMLStartTag:
307     lexHTMLStartTag(T);
308     return;
309   case LS_HTMLEndTag:
310     lexHTMLEndTag(T);
311     return;
312   }
313 
314   assert(State == LS_Normal);
315 
316   const char *TokenPtr = BufferPtr;
317   assert(TokenPtr < CommentEnd);
318   while (TokenPtr != CommentEnd) {
319     switch(*TokenPtr) {
320       case '\\':
321       case '@': {
322         TokenPtr++;
323         if (TokenPtr == CommentEnd) {
324           formTextToken(T, TokenPtr);
325           return;
326         }
327         char C = *TokenPtr;
328         switch (C) {
329         default:
330           break;
331 
332         case '\\': case '@': case '&': case '$':
333         case '#':  case '<': case '>': case '%':
334         case '\"': case '.': case ':':
335           // This is one of \\ \@ \& \$ etc escape sequences.
336           TokenPtr++;
337           if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
338             // This is the \:: escape sequence.
339             TokenPtr++;
340           }
341           StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
342           formTokenWithChars(T, TokenPtr, tok::text);
343           T.setText(UnescapedText);
344           return;
345         }
346 
347         // Don't make zero-length commands.
348         if (!isCommandNameStartCharacter(*TokenPtr)) {
349           formTextToken(T, TokenPtr);
350           return;
351         }
352 
353         TokenPtr = skipCommandName(TokenPtr, CommentEnd);
354         unsigned Length = TokenPtr - (BufferPtr + 1);
355 
356         // Hardcoded support for lexing LaTeX formula commands
357         // \f$ \f[ \f] \f{ \f} as a single command.
358         if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
359           C = *TokenPtr;
360           if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
361             TokenPtr++;
362             Length++;
363           }
364         }
365 
366         const StringRef CommandName(BufferPtr + 1, Length);
367 
368         const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
369         if (!Info) {
370           formTokenWithChars(T, TokenPtr, tok::unknown_command);
371           T.setUnknownCommandName(CommandName);
372           return;
373         }
374         if (Info->IsVerbatimBlockCommand) {
375           setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
376           return;
377         }
378         if (Info->IsVerbatimLineCommand) {
379           setupAndLexVerbatimLine(T, TokenPtr, Info);
380           return;
381         }
382         formTokenWithChars(T, TokenPtr, tok::command);
383         T.setCommandID(Info->getID());
384         return;
385       }
386 
387       case '&':
388         lexHTMLCharacterReference(T);
389         return;
390 
391       case '<': {
392         TokenPtr++;
393         if (TokenPtr == CommentEnd) {
394           formTextToken(T, TokenPtr);
395           return;
396         }
397         const char C = *TokenPtr;
398         if (isHTMLIdentifierStartingCharacter(C))
399           setupAndLexHTMLStartTag(T);
400         else if (C == '/')
401           setupAndLexHTMLEndTag(T);
402         else
403           formTextToken(T, TokenPtr);
404 
405         return;
406       }
407 
408       case '\n':
409       case '\r':
410         TokenPtr = skipNewline(TokenPtr, CommentEnd);
411         formTokenWithChars(T, TokenPtr, tok::newline);
412 
413         if (CommentState == LCS_InsideCComment)
414           skipLineStartingDecorations();
415         return;
416 
417       default: {
418         size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
419                          find_first_of("\n\r\\@&<");
420         if (End != StringRef::npos)
421           TokenPtr += End;
422         else
423           TokenPtr = CommentEnd;
424         formTextToken(T, TokenPtr);
425         return;
426       }
427     }
428   }
429 }
430 
431 void Lexer::setupAndLexVerbatimBlock(Token &T,
432                                      const char *TextBegin,
433                                      char Marker, const CommandInfo *Info) {
434   assert(Info->IsVerbatimBlockCommand);
435 
436   VerbatimBlockEndCommandName.clear();
437   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
438   VerbatimBlockEndCommandName.append(Info->EndCommandName);
439 
440   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
441   T.setVerbatimBlockID(Info->getID());
442 
443   // If there is a newline following the verbatim opening command, skip the
444   // newline so that we don't create an tok::verbatim_block_line with empty
445   // text content.
446   if (BufferPtr != CommentEnd) {
447     const char C = *BufferPtr;
448     if (C == '\n' || C == '\r') {
449       BufferPtr = skipNewline(BufferPtr, CommentEnd);
450       State = LS_VerbatimBlockBody;
451       return;
452     }
453   }
454 
455   State = LS_VerbatimBlockFirstLine;
456 }
457 
458 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
459 again:
460   assert(BufferPtr < CommentEnd);
461 
462   // FIXME: It would be better to scan the text once, finding either the block
463   // end command or newline.
464   //
465   // Extract current line.
466   const char *Newline = findNewline(BufferPtr, CommentEnd);
467   StringRef Line(BufferPtr, Newline - BufferPtr);
468 
469   // Look for end command in current line.
470   size_t Pos = Line.find(VerbatimBlockEndCommandName);
471   const char *TextEnd;
472   const char *NextLine;
473   if (Pos == StringRef::npos) {
474     // Current line is completely verbatim.
475     TextEnd = Newline;
476     NextLine = skipNewline(Newline, CommentEnd);
477   } else if (Pos == 0) {
478     // Current line contains just an end command.
479     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
480     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
481     formTokenWithChars(T, End, tok::verbatim_block_end);
482     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
483     State = LS_Normal;
484     return;
485   } else {
486     // There is some text, followed by end command.  Extract text first.
487     TextEnd = BufferPtr + Pos;
488     NextLine = TextEnd;
489     // If there is only whitespace before end command, skip whitespace.
490     if (isWhitespace(BufferPtr, TextEnd)) {
491       BufferPtr = TextEnd;
492       goto again;
493     }
494   }
495 
496   StringRef Text(BufferPtr, TextEnd - BufferPtr);
497   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
498   T.setVerbatimBlockText(Text);
499 
500   State = LS_VerbatimBlockBody;
501 }
502 
503 void Lexer::lexVerbatimBlockBody(Token &T) {
504   assert(State == LS_VerbatimBlockBody);
505 
506   if (CommentState == LCS_InsideCComment)
507     skipLineStartingDecorations();
508 
509   lexVerbatimBlockFirstLine(T);
510 }
511 
512 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
513                                     const CommandInfo *Info) {
514   assert(Info->IsVerbatimLineCommand);
515   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
516   T.setVerbatimLineID(Info->getID());
517 
518   State = LS_VerbatimLineText;
519 }
520 
521 void Lexer::lexVerbatimLineText(Token &T) {
522   assert(State == LS_VerbatimLineText);
523 
524   // Extract current line.
525   const char *Newline = findNewline(BufferPtr, CommentEnd);
526   const StringRef Text(BufferPtr, Newline - BufferPtr);
527   formTokenWithChars(T, Newline, tok::verbatim_line_text);
528   T.setVerbatimLineText(Text);
529 
530   State = LS_Normal;
531 }
532 
533 void Lexer::lexHTMLCharacterReference(Token &T) {
534   const char *TokenPtr = BufferPtr;
535   assert(*TokenPtr == '&');
536   TokenPtr++;
537   if (TokenPtr == CommentEnd) {
538     formTextToken(T, TokenPtr);
539     return;
540   }
541   const char *NamePtr;
542   bool isNamed = false;
543   bool isDecimal = false;
544   char C = *TokenPtr;
545   if (isHTMLNamedCharacterReferenceCharacter(C)) {
546     NamePtr = TokenPtr;
547     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
548     isNamed = true;
549   } else if (C == '#') {
550     TokenPtr++;
551     if (TokenPtr == CommentEnd) {
552       formTextToken(T, TokenPtr);
553       return;
554     }
555     C = *TokenPtr;
556     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
557       NamePtr = TokenPtr;
558       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
559       isDecimal = true;
560     } else if (C == 'x' || C == 'X') {
561       TokenPtr++;
562       NamePtr = TokenPtr;
563       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
564     } else {
565       formTextToken(T, TokenPtr);
566       return;
567     }
568   } else {
569     formTextToken(T, TokenPtr);
570     return;
571   }
572   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
573       *TokenPtr != ';') {
574     formTextToken(T, TokenPtr);
575     return;
576   }
577   StringRef Name(NamePtr, TokenPtr - NamePtr);
578   TokenPtr++; // Skip semicolon.
579   StringRef Resolved;
580   if (isNamed)
581     Resolved = resolveHTMLNamedCharacterReference(Name);
582   else if (isDecimal)
583     Resolved = resolveHTMLDecimalCharacterReference(Name);
584   else
585     Resolved = resolveHTMLHexCharacterReference(Name);
586 
587   if (Resolved.empty()) {
588     formTextToken(T, TokenPtr);
589     return;
590   }
591   formTokenWithChars(T, TokenPtr, tok::text);
592   T.setText(Resolved);
593   return;
594 }
595 
596 void Lexer::setupAndLexHTMLStartTag(Token &T) {
597   assert(BufferPtr[0] == '<' &&
598          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
599   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
600   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
601   if (!isHTMLTagName(Name)) {
602     formTextToken(T, TagNameEnd);
603     return;
604   }
605 
606   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
607   T.setHTMLTagStartName(Name);
608 
609   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
610 
611   const char C = *BufferPtr;
612   if (BufferPtr != CommentEnd &&
613       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
614     State = LS_HTMLStartTag;
615 }
616 
617 void Lexer::lexHTMLStartTag(Token &T) {
618   assert(State == LS_HTMLStartTag);
619 
620   const char *TokenPtr = BufferPtr;
621   char C = *TokenPtr;
622   if (isHTMLIdentifierCharacter(C)) {
623     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
624     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
625     formTokenWithChars(T, TokenPtr, tok::html_ident);
626     T.setHTMLIdent(Ident);
627   } else {
628     switch (C) {
629     case '=':
630       TokenPtr++;
631       formTokenWithChars(T, TokenPtr, tok::html_equals);
632       break;
633     case '\"':
634     case '\'': {
635       const char *OpenQuote = TokenPtr;
636       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
637       const char *ClosingQuote = TokenPtr;
638       if (TokenPtr != CommentEnd) // Skip closing quote.
639         TokenPtr++;
640       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
641       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
642                                       ClosingQuote - (OpenQuote + 1)));
643       break;
644     }
645     case '>':
646       TokenPtr++;
647       formTokenWithChars(T, TokenPtr, tok::html_greater);
648       State = LS_Normal;
649       return;
650     case '/':
651       TokenPtr++;
652       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
653         TokenPtr++;
654         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
655       } else
656         formTextToken(T, TokenPtr);
657 
658       State = LS_Normal;
659       return;
660     }
661   }
662 
663   // Now look ahead and return to normal state if we don't see any HTML tokens
664   // ahead.
665   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
666   if (BufferPtr == CommentEnd) {
667     State = LS_Normal;
668     return;
669   }
670 
671   C = *BufferPtr;
672   if (!isHTMLIdentifierStartingCharacter(C) &&
673       C != '=' && C != '\"' && C != '\'' && C != '>') {
674     State = LS_Normal;
675     return;
676   }
677 }
678 
679 void Lexer::setupAndLexHTMLEndTag(Token &T) {
680   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
681 
682   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
683   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
684   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
685   if (!isHTMLTagName(Name)) {
686     formTextToken(T, TagNameEnd);
687     return;
688   }
689 
690   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
691 
692   formTokenWithChars(T, End, tok::html_end_tag);
693   T.setHTMLTagEndName(Name);
694 
695   if (BufferPtr != CommentEnd && *BufferPtr == '>')
696     State = LS_HTMLEndTag;
697 }
698 
699 void Lexer::lexHTMLEndTag(Token &T) {
700   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
701 
702   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
703   State = LS_Normal;
704 }
705 
706 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
707              SourceLocation FileLoc,
708              const char *BufferStart, const char *BufferEnd):
709     Allocator(Allocator), Traits(Traits),
710     BufferStart(BufferStart), BufferEnd(BufferEnd),
711     FileLoc(FileLoc), BufferPtr(BufferStart),
712     CommentState(LCS_BeforeComment), State(LS_Normal) {
713 }
714 
715 void Lexer::lex(Token &T) {
716 again:
717   switch (CommentState) {
718   case LCS_BeforeComment:
719     if (BufferPtr == BufferEnd) {
720       formTokenWithChars(T, BufferPtr, tok::eof);
721       return;
722     }
723 
724     assert(*BufferPtr == '/');
725     BufferPtr++; // Skip first slash.
726     switch(*BufferPtr) {
727     case '/': { // BCPL comment.
728       BufferPtr++; // Skip second slash.
729 
730       if (BufferPtr != BufferEnd) {
731         // Skip Doxygen magic marker, if it is present.
732         // It might be missing because of a typo //< or /*<, or because we
733         // merged this non-Doxygen comment into a bunch of Doxygen comments
734         // around it: /** ... */ /* ... */ /** ... */
735         const char C = *BufferPtr;
736         if (C == '/' || C == '!')
737           BufferPtr++;
738       }
739 
740       // Skip less-than symbol that marks trailing comments.
741       // Skip it even if the comment is not a Doxygen one, because //< and /*<
742       // are frequent typos.
743       if (BufferPtr != BufferEnd && *BufferPtr == '<')
744         BufferPtr++;
745 
746       CommentState = LCS_InsideBCPLComment;
747       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
748         State = LS_Normal;
749       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
750       goto again;
751     }
752     case '*': { // C comment.
753       BufferPtr++; // Skip star.
754 
755       // Skip Doxygen magic marker.
756       const char C = *BufferPtr;
757       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
758         BufferPtr++;
759 
760       // Skip less-than symbol that marks trailing comments.
761       if (BufferPtr != BufferEnd && *BufferPtr == '<')
762         BufferPtr++;
763 
764       CommentState = LCS_InsideCComment;
765       State = LS_Normal;
766       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
767       goto again;
768     }
769     default:
770       llvm_unreachable("second character of comment should be '/' or '*'");
771     }
772 
773   case LCS_BetweenComments: {
774     // Consecutive comments are extracted only if there is only whitespace
775     // between them.  So we can search for the start of the next comment.
776     const char *EndWhitespace = BufferPtr;
777     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
778       EndWhitespace++;
779 
780     // Turn any whitespace between comments (and there is only whitespace
781     // between them -- guaranteed by comment extraction) into a newline.  We
782     // have two newlines between C comments in total (first one was synthesized
783     // after a comment).
784     formTokenWithChars(T, EndWhitespace, tok::newline);
785 
786     CommentState = LCS_BeforeComment;
787     break;
788   }
789 
790   case LCS_InsideBCPLComment:
791   case LCS_InsideCComment:
792     if (BufferPtr != CommentEnd) {
793       lexCommentText(T);
794       break;
795     } else {
796       // Skip C comment closing sequence.
797       if (CommentState == LCS_InsideCComment) {
798         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
799         BufferPtr += 2;
800         assert(BufferPtr <= BufferEnd);
801 
802         // Synthenize newline just after the C comment, regardless if there is
803         // actually a newline.
804         formTokenWithChars(T, BufferPtr, tok::newline);
805 
806         CommentState = LCS_BetweenComments;
807         break;
808       } else {
809         // Don't synthesized a newline after BCPL comment.
810         CommentState = LCS_BetweenComments;
811         goto again;
812       }
813     }
814   }
815 }
816 
817 StringRef Lexer::getSpelling(const Token &Tok,
818                              const SourceManager &SourceMgr,
819                              bool *Invalid) const {
820   SourceLocation Loc = Tok.getLocation();
821   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
822 
823   bool InvalidTemp = false;
824   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
825   if (InvalidTemp) {
826     *Invalid = true;
827     return StringRef();
828   }
829 
830   const char *Begin = File.data() + LocInfo.second;
831   return StringRef(Begin, Tok.getLength());
832 }
833 
834 } // end namespace comments
835 } // end namespace clang
836 
837