1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
17 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
21 
22 namespace clang {
23 namespace format {
24 
25 FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
26                                    unsigned Column, const FormatStyle &Style,
27                                    encoding::Encoding Encoding)
28     : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
29       Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
30       Style(Style), IdentTable(getFormattingLangOpts(Style)),
31       Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
32       FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
33       MacroBlockEndRegex(Style.MacroBlockEnd) {
34   Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
35                       getFormattingLangOpts(Style)));
36   Lex->SetKeepWhitespaceMode(true);
37 
38   for (const std::string &ForEachMacro : Style.ForEachMacros)
39     Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
40   for (const std::string &StatementMacro : Style.StatementMacros)
41     Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
42   for (const std::string &TypenameMacro : Style.TypenameMacros)
43     Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro});
44   for (const std::string &NamespaceMacro : Style.NamespaceMacros)
45     Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro});
46 }
47 
48 ArrayRef<FormatToken *> FormatTokenLexer::lex() {
49   assert(Tokens.empty());
50   assert(FirstInLineIndex == 0);
51   do {
52     Tokens.push_back(getNextToken());
53     if (Style.Language == FormatStyle::LK_JavaScript) {
54       tryParseJSRegexLiteral();
55       handleTemplateStrings();
56     }
57     if (Style.Language == FormatStyle::LK_TextProto)
58       tryParsePythonComment();
59     tryMergePreviousTokens();
60     if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
61       FirstInLineIndex = Tokens.size() - 1;
62   } while (Tokens.back()->Tok.isNot(tok::eof));
63   return Tokens;
64 }
65 
66 void FormatTokenLexer::tryMergePreviousTokens() {
67   if (tryMerge_TMacro())
68     return;
69   if (tryMergeConflictMarkers())
70     return;
71   if (tryMergeLessLess())
72     return;
73 
74   if (Style.isCSharp()) {
75     if (tryMergeCSharpKeywordVariables())
76       return;
77     if (tryMergeCSharpVerbatimStringLiteral())
78       return;
79     if (tryMergeCSharpDoubleQuestion())
80       return;
81     if (tryMergeCSharpNullConditionals())
82       return;
83     if (tryTransformCSharpForEach())
84       return;
85     static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
86     if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
87       return;
88   }
89 
90   if (tryMergeNSStringLiteral())
91     return;
92 
93   if (Style.Language == FormatStyle::LK_JavaScript) {
94     static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
95     static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
96                                                    tok::equal};
97     static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
98                                                   tok::greaterequal};
99     static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
100     static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
101     static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
102                                                            tok::starequal};
103     static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question,
104                                                                tok::period};
105     static const tok::TokenKind JSNullishOperator[] = {tok::question,
106                                                        tok::question};
107 
108     // FIXME: Investigate what token type gives the correct operator priority.
109     if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
110       return;
111     if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
112       return;
113     if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
114       return;
115     if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
116       return;
117     if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
118       return;
119     if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
120       Tokens.back()->Tok.setKind(tok::starequal);
121       return;
122     }
123     if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) {
124       // Treat like the "||" operator (as opposed to the ternary ?).
125       Tokens.back()->Tok.setKind(tok::pipepipe);
126       return;
127     }
128     if (tryMergeTokens(JSNullPropagatingOperator,
129                        TT_JsNullPropagatingOperator)) {
130       // Treat like a regular "." access.
131       Tokens.back()->Tok.setKind(tok::period);
132       return;
133     }
134     if (tryMergeJSPrivateIdentifier())
135       return;
136   }
137 
138   if (Style.Language == FormatStyle::LK_Java) {
139     static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
140         tok::greater, tok::greater, tok::greaterequal};
141     if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
142       return;
143   }
144 }
145 
146 bool FormatTokenLexer::tryMergeNSStringLiteral() {
147   if (Tokens.size() < 2)
148     return false;
149   auto &At = *(Tokens.end() - 2);
150   auto &String = *(Tokens.end() - 1);
151   if (!At->is(tok::at) || !String->is(tok::string_literal))
152     return false;
153   At->Tok.setKind(tok::string_literal);
154   At->TokenText = StringRef(At->TokenText.begin(),
155                             String->TokenText.end() - At->TokenText.begin());
156   At->ColumnWidth += String->ColumnWidth;
157   At->Type = TT_ObjCStringLiteral;
158   Tokens.erase(Tokens.end() - 1);
159   return true;
160 }
161 
162 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
163   // Merges #idenfier into a single identifier with the text #identifier
164   // but the token tok::identifier.
165   if (Tokens.size() < 2)
166     return false;
167   auto &Hash = *(Tokens.end() - 2);
168   auto &Identifier = *(Tokens.end() - 1);
169   if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
170     return false;
171   Hash->Tok.setKind(tok::identifier);
172   Hash->TokenText =
173       StringRef(Hash->TokenText.begin(),
174                 Identifier->TokenText.end() - Hash->TokenText.begin());
175   Hash->ColumnWidth += Identifier->ColumnWidth;
176   Hash->Type = TT_JsPrivateIdentifier;
177   Tokens.erase(Tokens.end() - 1);
178   return true;
179 }
180 
181 // Search for verbatim or interpolated string literals @"ABC" or
182 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
183 // prevent splitting of @, $ and ".
184 bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() {
185   if (Tokens.size() < 2)
186     return false;
187   auto &At = *(Tokens.end() - 2);
188   auto &String = *(Tokens.end() - 1);
189 
190   // Look for $"aaaaaa" @"aaaaaa".
191   if (!(At->is(tok::at) || At->TokenText == "$") ||
192       !String->is(tok::string_literal))
193     return false;
194 
195   if (Tokens.size() >= 2 && At->is(tok::at)) {
196     auto &Dollar = *(Tokens.end() - 3);
197     if (Dollar->TokenText == "$") {
198       // This looks like $@"aaaaa" so we need to combine all 3 tokens.
199       Dollar->Tok.setKind(tok::string_literal);
200       Dollar->TokenText =
201           StringRef(Dollar->TokenText.begin(),
202                     String->TokenText.end() - Dollar->TokenText.begin());
203       Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
204       Dollar->Type = TT_CSharpStringLiteral;
205       Tokens.erase(Tokens.end() - 2);
206       Tokens.erase(Tokens.end() - 1);
207       return true;
208     }
209   }
210 
211   // Convert back into just a string_literal.
212   At->Tok.setKind(tok::string_literal);
213   At->TokenText = StringRef(At->TokenText.begin(),
214                             String->TokenText.end() - At->TokenText.begin());
215   At->ColumnWidth += String->ColumnWidth;
216   At->Type = TT_CSharpStringLiteral;
217   Tokens.erase(Tokens.end() - 1);
218   return true;
219 }
220 
221 bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
222   if (Tokens.size() < 2)
223     return false;
224   auto &FirstQuestion = *(Tokens.end() - 2);
225   auto &SecondQuestion = *(Tokens.end() - 1);
226   if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question))
227     return false;
228   FirstQuestion->Tok.setKind(tok::question);
229   FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
230                                        SecondQuestion->TokenText.end() -
231                                            FirstQuestion->TokenText.begin());
232   FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
233   FirstQuestion->Type = TT_CSharpNullCoalescing;
234   Tokens.erase(Tokens.end() - 1);
235   return true;
236 }
237 
238 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
239   if (Tokens.size() < 2)
240     return false;
241   auto &At = *(Tokens.end() - 2);
242   auto &Keyword = *(Tokens.end() - 1);
243   if (!At->is(tok::at))
244     return false;
245   if (!Keywords.isCSharpKeyword(*Keyword))
246     return false;
247 
248   At->Tok.setKind(tok::identifier);
249   At->TokenText = StringRef(At->TokenText.begin(),
250                             Keyword->TokenText.end() - At->TokenText.begin());
251   At->ColumnWidth += Keyword->ColumnWidth;
252   At->Type = Keyword->Type;
253   Tokens.erase(Tokens.end() - 1);
254   return true;
255 }
256 
257 // In C# merge the Identifier and the ? together e.g. arg?.
258 bool FormatTokenLexer::tryMergeCSharpNullConditionals() {
259   if (Tokens.size() < 2)
260     return false;
261   auto &Identifier = *(Tokens.end() - 2);
262   auto &Question = *(Tokens.end() - 1);
263   if (!Identifier->isOneOf(tok::r_square, tok::identifier) ||
264       !Question->is(tok::question))
265     return false;
266   Identifier->TokenText =
267       StringRef(Identifier->TokenText.begin(),
268                 Question->TokenText.end() - Identifier->TokenText.begin());
269   Identifier->ColumnWidth += Question->ColumnWidth;
270   Tokens.erase(Tokens.end() - 1);
271   return true;
272 }
273 
274 // In C# transform identifier foreach into kw_foreach
275 bool FormatTokenLexer::tryTransformCSharpForEach() {
276   if (Tokens.size() < 1)
277     return false;
278   auto &Identifier = *(Tokens.end() - 1);
279   if (!Identifier->is(tok::identifier))
280     return false;
281   if (Identifier->TokenText != "foreach")
282     return false;
283 
284   Identifier->Type = TT_ForEachMacro;
285   Identifier->Tok.setKind(tok::kw_for);
286   return true;
287 }
288 
289 bool FormatTokenLexer::tryMergeLessLess() {
290   // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
291   if (Tokens.size() < 3)
292     return false;
293 
294   bool FourthTokenIsLess = false;
295   if (Tokens.size() > 3)
296     FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
297 
298   auto First = Tokens.end() - 3;
299   if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
300       First[0]->isNot(tok::less) || FourthTokenIsLess)
301     return false;
302 
303   // Only merge if there currently is no whitespace between the two "<".
304   if (First[1]->WhitespaceRange.getBegin() !=
305       First[1]->WhitespaceRange.getEnd())
306     return false;
307 
308   First[0]->Tok.setKind(tok::lessless);
309   First[0]->TokenText = "<<";
310   First[0]->ColumnWidth += 1;
311   Tokens.erase(Tokens.end() - 2);
312   return true;
313 }
314 
315 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
316                                       TokenType NewType) {
317   if (Tokens.size() < Kinds.size())
318     return false;
319 
320   SmallVectorImpl<FormatToken *>::const_iterator First =
321       Tokens.end() - Kinds.size();
322   if (!First[0]->is(Kinds[0]))
323     return false;
324   unsigned AddLength = 0;
325   for (unsigned i = 1; i < Kinds.size(); ++i) {
326     if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
327                                        First[i]->WhitespaceRange.getEnd())
328       return false;
329     AddLength += First[i]->TokenText.size();
330   }
331   Tokens.resize(Tokens.size() - Kinds.size() + 1);
332   First[0]->TokenText = StringRef(First[0]->TokenText.data(),
333                                   First[0]->TokenText.size() + AddLength);
334   First[0]->ColumnWidth += AddLength;
335   First[0]->Type = NewType;
336   return true;
337 }
338 
339 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
340 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
341   // NB: This is not entirely correct, as an r_paren can introduce an operand
342   // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
343   // corner case to not matter in practice, though.
344   return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
345                       tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
346                       tok::colon, tok::question, tok::tilde) ||
347          Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
348                       tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
349                       tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
350          Tok->isBinaryOperator();
351 }
352 
353 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
354   if (!Prev)
355     return true;
356 
357   // Regex literals can only follow after prefix unary operators, not after
358   // postfix unary operators. If the '++' is followed by a non-operand
359   // introducing token, the slash here is the operand and not the start of a
360   // regex.
361   // `!` is an unary prefix operator, but also a post-fix operator that casts
362   // away nullability, so the same check applies.
363   if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
364     return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
365 
366   // The previous token must introduce an operand location where regex
367   // literals can occur.
368   if (!precedesOperand(Prev))
369     return false;
370 
371   return true;
372 }
373 
374 // Tries to parse a JavaScript Regex literal starting at the current token,
375 // if that begins with a slash and is in a location where JavaScript allows
376 // regex literals. Changes the current token to a regex literal and updates
377 // its text if successful.
378 void FormatTokenLexer::tryParseJSRegexLiteral() {
379   FormatToken *RegexToken = Tokens.back();
380   if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
381     return;
382 
383   FormatToken *Prev = nullptr;
384   for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
385     // NB: Because previous pointers are not initialized yet, this cannot use
386     // Token.getPreviousNonComment.
387     if ((*I)->isNot(tok::comment)) {
388       Prev = *I;
389       break;
390     }
391   }
392 
393   if (!canPrecedeRegexLiteral(Prev))
394     return;
395 
396   // 'Manually' lex ahead in the current file buffer.
397   const char *Offset = Lex->getBufferLocation();
398   const char *RegexBegin = Offset - RegexToken->TokenText.size();
399   StringRef Buffer = Lex->getBuffer();
400   bool InCharacterClass = false;
401   bool HaveClosingSlash = false;
402   for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
403     // Regular expressions are terminated with a '/', which can only be
404     // escaped using '\' or a character class between '[' and ']'.
405     // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
406     switch (*Offset) {
407     case '\\':
408       // Skip the escaped character.
409       ++Offset;
410       break;
411     case '[':
412       InCharacterClass = true;
413       break;
414     case ']':
415       InCharacterClass = false;
416       break;
417     case '/':
418       if (!InCharacterClass)
419         HaveClosingSlash = true;
420       break;
421     }
422   }
423 
424   RegexToken->Type = TT_RegexLiteral;
425   // Treat regex literals like other string_literals.
426   RegexToken->Tok.setKind(tok::string_literal);
427   RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
428   RegexToken->ColumnWidth = RegexToken->TokenText.size();
429 
430   resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
431 }
432 
433 void FormatTokenLexer::handleTemplateStrings() {
434   FormatToken *BacktickToken = Tokens.back();
435 
436   if (BacktickToken->is(tok::l_brace)) {
437     StateStack.push(LexerState::NORMAL);
438     return;
439   }
440   if (BacktickToken->is(tok::r_brace)) {
441     if (StateStack.size() == 1)
442       return;
443     StateStack.pop();
444     if (StateStack.top() != LexerState::TEMPLATE_STRING)
445       return;
446     // If back in TEMPLATE_STRING, fallthrough and continue parsing the
447   } else if (BacktickToken->is(tok::unknown) &&
448              BacktickToken->TokenText == "`") {
449     StateStack.push(LexerState::TEMPLATE_STRING);
450   } else {
451     return; // Not actually a template
452   }
453 
454   // 'Manually' lex ahead in the current file buffer.
455   const char *Offset = Lex->getBufferLocation();
456   const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
457   for (; Offset != Lex->getBuffer().end(); ++Offset) {
458     if (Offset[0] == '`') {
459       StateStack.pop();
460       break;
461     }
462     if (Offset[0] == '\\') {
463       ++Offset; // Skip the escaped character.
464     } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
465                Offset[1] == '{') {
466       // '${' introduces an expression interpolation in the template string.
467       StateStack.push(LexerState::NORMAL);
468       ++Offset;
469       break;
470     }
471   }
472 
473   StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
474   BacktickToken->Type = TT_TemplateString;
475   BacktickToken->Tok.setKind(tok::string_literal);
476   BacktickToken->TokenText = LiteralText;
477 
478   // Adjust width for potentially multiline string literals.
479   size_t FirstBreak = LiteralText.find('\n');
480   StringRef FirstLineText = FirstBreak == StringRef::npos
481                                 ? LiteralText
482                                 : LiteralText.substr(0, FirstBreak);
483   BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
484       FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
485   size_t LastBreak = LiteralText.rfind('\n');
486   if (LastBreak != StringRef::npos) {
487     BacktickToken->IsMultiline = true;
488     unsigned StartColumn = 0; // The template tail spans the entire line.
489     BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
490         LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
491         Style.TabWidth, Encoding);
492   }
493 
494   SourceLocation loc = Offset < Lex->getBuffer().end()
495                            ? Lex->getSourceLocation(Offset + 1)
496                            : SourceMgr.getLocForEndOfFile(ID);
497   resetLexer(SourceMgr.getFileOffset(loc));
498 }
499 
500 void FormatTokenLexer::tryParsePythonComment() {
501   FormatToken *HashToken = Tokens.back();
502   if (!HashToken->isOneOf(tok::hash, tok::hashhash))
503     return;
504   // Turn the remainder of this line into a comment.
505   const char *CommentBegin =
506       Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
507   size_t From = CommentBegin - Lex->getBuffer().begin();
508   size_t To = Lex->getBuffer().find_first_of('\n', From);
509   if (To == StringRef::npos)
510     To = Lex->getBuffer().size();
511   size_t Len = To - From;
512   HashToken->Type = TT_LineComment;
513   HashToken->Tok.setKind(tok::comment);
514   HashToken->TokenText = Lex->getBuffer().substr(From, Len);
515   SourceLocation Loc = To < Lex->getBuffer().size()
516                            ? Lex->getSourceLocation(CommentBegin + Len)
517                            : SourceMgr.getLocForEndOfFile(ID);
518   resetLexer(SourceMgr.getFileOffset(Loc));
519 }
520 
521 bool FormatTokenLexer::tryMerge_TMacro() {
522   if (Tokens.size() < 4)
523     return false;
524   FormatToken *Last = Tokens.back();
525   if (!Last->is(tok::r_paren))
526     return false;
527 
528   FormatToken *String = Tokens[Tokens.size() - 2];
529   if (!String->is(tok::string_literal) || String->IsMultiline)
530     return false;
531 
532   if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
533     return false;
534 
535   FormatToken *Macro = Tokens[Tokens.size() - 4];
536   if (Macro->TokenText != "_T")
537     return false;
538 
539   const char *Start = Macro->TokenText.data();
540   const char *End = Last->TokenText.data() + Last->TokenText.size();
541   String->TokenText = StringRef(Start, End - Start);
542   String->IsFirst = Macro->IsFirst;
543   String->LastNewlineOffset = Macro->LastNewlineOffset;
544   String->WhitespaceRange = Macro->WhitespaceRange;
545   String->OriginalColumn = Macro->OriginalColumn;
546   String->ColumnWidth = encoding::columnWidthWithTabs(
547       String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
548   String->NewlinesBefore = Macro->NewlinesBefore;
549   String->HasUnescapedNewline = Macro->HasUnescapedNewline;
550 
551   Tokens.pop_back();
552   Tokens.pop_back();
553   Tokens.pop_back();
554   Tokens.back() = String;
555   return true;
556 }
557 
558 bool FormatTokenLexer::tryMergeConflictMarkers() {
559   if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
560     return false;
561 
562   // Conflict lines look like:
563   // <marker> <text from the vcs>
564   // For example:
565   // >>>>>>> /file/in/file/system at revision 1234
566   //
567   // We merge all tokens in a line that starts with a conflict marker
568   // into a single token with a special token type that the unwrapped line
569   // parser will use to correctly rebuild the underlying code.
570 
571   FileID ID;
572   // Get the position of the first token in the line.
573   unsigned FirstInLineOffset;
574   std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
575       Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
576   StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
577   // Calculate the offset of the start of the current line.
578   auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
579   if (LineOffset == StringRef::npos) {
580     LineOffset = 0;
581   } else {
582     ++LineOffset;
583   }
584 
585   auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
586   StringRef LineStart;
587   if (FirstSpace == StringRef::npos) {
588     LineStart = Buffer.substr(LineOffset);
589   } else {
590     LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
591   }
592 
593   TokenType Type = TT_Unknown;
594   if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
595     Type = TT_ConflictStart;
596   } else if (LineStart == "|||||||" || LineStart == "=======" ||
597              LineStart == "====") {
598     Type = TT_ConflictAlternative;
599   } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
600     Type = TT_ConflictEnd;
601   }
602 
603   if (Type != TT_Unknown) {
604     FormatToken *Next = Tokens.back();
605 
606     Tokens.resize(FirstInLineIndex + 1);
607     // We do not need to build a complete token here, as we will skip it
608     // during parsing anyway (as we must not touch whitespace around conflict
609     // markers).
610     Tokens.back()->Type = Type;
611     Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
612 
613     Tokens.push_back(Next);
614     return true;
615   }
616 
617   return false;
618 }
619 
620 FormatToken *FormatTokenLexer::getStashedToken() {
621   // Create a synthesized second '>' or '<' token.
622   Token Tok = FormatTok->Tok;
623   StringRef TokenText = FormatTok->TokenText;
624 
625   unsigned OriginalColumn = FormatTok->OriginalColumn;
626   FormatTok = new (Allocator.Allocate()) FormatToken;
627   FormatTok->Tok = Tok;
628   SourceLocation TokLocation =
629       FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
630   FormatTok->Tok.setLocation(TokLocation);
631   FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
632   FormatTok->TokenText = TokenText;
633   FormatTok->ColumnWidth = 1;
634   FormatTok->OriginalColumn = OriginalColumn + 1;
635 
636   return FormatTok;
637 }
638 
639 FormatToken *FormatTokenLexer::getNextToken() {
640   if (StateStack.top() == LexerState::TOKEN_STASHED) {
641     StateStack.pop();
642     return getStashedToken();
643   }
644 
645   FormatTok = new (Allocator.Allocate()) FormatToken;
646   readRawToken(*FormatTok);
647   SourceLocation WhitespaceStart =
648       FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
649   FormatTok->IsFirst = IsFirstToken;
650   IsFirstToken = false;
651 
652   // Consume and record whitespace until we find a significant token.
653   unsigned WhitespaceLength = TrailingWhitespace;
654   while (FormatTok->Tok.is(tok::unknown)) {
655     StringRef Text = FormatTok->TokenText;
656     auto EscapesNewline = [&](int pos) {
657       // A '\r' here is just part of '\r\n'. Skip it.
658       if (pos >= 0 && Text[pos] == '\r')
659         --pos;
660       // See whether there is an odd number of '\' before this.
661       // FIXME: This is wrong. A '\' followed by a newline is always removed,
662       // regardless of whether there is another '\' before it.
663       // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
664       unsigned count = 0;
665       for (; pos >= 0; --pos, ++count)
666         if (Text[pos] != '\\')
667           break;
668       return count & 1;
669     };
670     // FIXME: This miscounts tok:unknown tokens that are not just
671     // whitespace, e.g. a '`' character.
672     for (int i = 0, e = Text.size(); i != e; ++i) {
673       switch (Text[i]) {
674       case '\n':
675         ++FormatTok->NewlinesBefore;
676         FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
677         FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
678         Column = 0;
679         break;
680       case '\r':
681         FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
682         Column = 0;
683         break;
684       case '\f':
685       case '\v':
686         Column = 0;
687         break;
688       case ' ':
689         ++Column;
690         break;
691       case '\t':
692         Column +=
693             Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
694         break;
695       case '\\':
696         if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
697           FormatTok->Type = TT_ImplicitStringLiteral;
698         break;
699       default:
700         FormatTok->Type = TT_ImplicitStringLiteral;
701         break;
702       }
703       if (FormatTok->Type == TT_ImplicitStringLiteral)
704         break;
705     }
706 
707     if (FormatTok->is(TT_ImplicitStringLiteral))
708       break;
709     WhitespaceLength += FormatTok->Tok.getLength();
710 
711     readRawToken(*FormatTok);
712   }
713 
714   // JavaScript and Java do not allow to escape the end of the line with a
715   // backslash. Backslashes are syntax errors in plain source, but can occur in
716   // comments. When a single line comment ends with a \, it'll cause the next
717   // line of code to be lexed as a comment, breaking formatting. The code below
718   // finds comments that contain a backslash followed by a line break, truncates
719   // the comment token at the backslash, and resets the lexer to restart behind
720   // the backslash.
721   if ((Style.Language == FormatStyle::LK_JavaScript ||
722        Style.Language == FormatStyle::LK_Java) &&
723       FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
724     size_t BackslashPos = FormatTok->TokenText.find('\\');
725     while (BackslashPos != StringRef::npos) {
726       if (BackslashPos + 1 < FormatTok->TokenText.size() &&
727           FormatTok->TokenText[BackslashPos + 1] == '\n') {
728         const char *Offset = Lex->getBufferLocation();
729         Offset -= FormatTok->TokenText.size();
730         Offset += BackslashPos + 1;
731         resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
732         FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
733         FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
734             FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
735             Encoding);
736         break;
737       }
738       BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
739     }
740   }
741 
742   // In case the token starts with escaped newlines, we want to
743   // take them into account as whitespace - this pattern is quite frequent
744   // in macro definitions.
745   // FIXME: Add a more explicit test.
746   while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
747     unsigned SkippedWhitespace = 0;
748     if (FormatTok->TokenText.size() > 2 &&
749         (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
750       SkippedWhitespace = 3;
751     else if (FormatTok->TokenText[1] == '\n')
752       SkippedWhitespace = 2;
753     else
754       break;
755 
756     ++FormatTok->NewlinesBefore;
757     WhitespaceLength += SkippedWhitespace;
758     FormatTok->LastNewlineOffset = SkippedWhitespace;
759     Column = 0;
760     FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
761   }
762 
763   FormatTok->WhitespaceRange = SourceRange(
764       WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
765 
766   FormatTok->OriginalColumn = Column;
767 
768   TrailingWhitespace = 0;
769   if (FormatTok->Tok.is(tok::comment)) {
770     // FIXME: Add the trimmed whitespace to Column.
771     StringRef UntrimmedText = FormatTok->TokenText;
772     FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
773     TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
774   } else if (FormatTok->Tok.is(tok::raw_identifier)) {
775     IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
776     FormatTok->Tok.setIdentifierInfo(&Info);
777     FormatTok->Tok.setKind(Info.getTokenID());
778     if (Style.Language == FormatStyle::LK_Java &&
779         FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
780                            tok::kw_operator)) {
781       FormatTok->Tok.setKind(tok::identifier);
782       FormatTok->Tok.setIdentifierInfo(nullptr);
783     } else if (Style.Language == FormatStyle::LK_JavaScript &&
784                FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
785                                   tok::kw_operator)) {
786       FormatTok->Tok.setKind(tok::identifier);
787       FormatTok->Tok.setIdentifierInfo(nullptr);
788     }
789   } else if (FormatTok->Tok.is(tok::greatergreater)) {
790     FormatTok->Tok.setKind(tok::greater);
791     FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
792     ++Column;
793     StateStack.push(LexerState::TOKEN_STASHED);
794   } else if (FormatTok->Tok.is(tok::lessless)) {
795     FormatTok->Tok.setKind(tok::less);
796     FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
797     ++Column;
798     StateStack.push(LexerState::TOKEN_STASHED);
799   }
800 
801   // Now FormatTok is the next non-whitespace token.
802 
803   StringRef Text = FormatTok->TokenText;
804   size_t FirstNewlinePos = Text.find('\n');
805   if (FirstNewlinePos == StringRef::npos) {
806     // FIXME: ColumnWidth actually depends on the start column, we need to
807     // take this into account when the token is moved.
808     FormatTok->ColumnWidth =
809         encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
810     Column += FormatTok->ColumnWidth;
811   } else {
812     FormatTok->IsMultiline = true;
813     // FIXME: ColumnWidth actually depends on the start column, we need to
814     // take this into account when the token is moved.
815     FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
816         Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
817 
818     // The last line of the token always starts in column 0.
819     // Thus, the length can be precomputed even in the presence of tabs.
820     FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
821         Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
822     Column = FormatTok->LastLineColumnWidth;
823   }
824 
825   if (Style.isCpp()) {
826     auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
827     if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
828           Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
829               tok::pp_define) &&
830         it != Macros.end()) {
831       FormatTok->Type = it->second;
832     } else if (FormatTok->is(tok::identifier)) {
833       if (MacroBlockBeginRegex.match(Text)) {
834         FormatTok->Type = TT_MacroBlockBegin;
835       } else if (MacroBlockEndRegex.match(Text)) {
836         FormatTok->Type = TT_MacroBlockEnd;
837       }
838     }
839   }
840 
841   return FormatTok;
842 }
843 
844 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
845   Lex->LexFromRawLexer(Tok.Tok);
846   Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
847                             Tok.Tok.getLength());
848   // For formatting, treat unterminated string literals like normal string
849   // literals.
850   if (Tok.is(tok::unknown)) {
851     if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
852       Tok.Tok.setKind(tok::string_literal);
853       Tok.IsUnterminatedLiteral = true;
854     } else if (Style.Language == FormatStyle::LK_JavaScript &&
855                Tok.TokenText == "''") {
856       Tok.Tok.setKind(tok::string_literal);
857     }
858   }
859 
860   if ((Style.Language == FormatStyle::LK_JavaScript ||
861        Style.Language == FormatStyle::LK_Proto ||
862        Style.Language == FormatStyle::LK_TextProto) &&
863       Tok.is(tok::char_constant)) {
864     Tok.Tok.setKind(tok::string_literal);
865   }
866 
867   if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
868                                Tok.TokenText == "/* clang-format on */")) {
869     FormattingDisabled = false;
870   }
871 
872   Tok.Finalized = FormattingDisabled;
873 
874   if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
875                                Tok.TokenText == "/* clang-format off */")) {
876     FormattingDisabled = true;
877   }
878 }
879 
880 void FormatTokenLexer::resetLexer(unsigned Offset) {
881   StringRef Buffer = SourceMgr.getBufferData(ID);
882   Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
883                       getFormattingLangOpts(Style), Buffer.begin(),
884                       Buffer.begin() + Offset, Buffer.end()));
885   Lex->SetKeepWhitespaceMode(true);
886   TrailingWhitespace = 0;
887 }
888 
889 } // namespace format
890 } // namespace clang
891