1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief This file implements FormatTokenLexer, which tokenizes a source file
12 /// into a FormatToken stream suitable for ClangFormat.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "FormatTokenLexer.h"
17 #include "FormatToken.h"
18 #include "clang/Basic/SourceLocation.h"
19 #include "clang/Basic/SourceManager.h"
20 #include "clang/Format/Format.h"
21 #include "llvm/Support/Regex.h"
22 
23 namespace clang {
24 namespace format {
25 
26 FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
27                                    const FormatStyle &Style,
28                                    encoding::Encoding Encoding)
29     : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
30       Column(0), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
31       Style(Style), IdentTable(getFormattingLangOpts(Style)),
32       Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
33       FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
34       MacroBlockEndRegex(Style.MacroBlockEnd) {
35   Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
36                       getFormattingLangOpts(Style)));
37   Lex->SetKeepWhitespaceMode(true);
38 
39   for (const std::string &ForEachMacro : Style.ForEachMacros)
40     ForEachMacros.push_back(&IdentTable.get(ForEachMacro));
41   std::sort(ForEachMacros.begin(), ForEachMacros.end());
42 }
43 
44 ArrayRef<FormatToken *> FormatTokenLexer::lex() {
45   assert(Tokens.empty());
46   assert(FirstInLineIndex == 0);
47   do {
48     Tokens.push_back(getNextToken());
49     if (Style.Language == FormatStyle::LK_JavaScript) {
50       tryParseJSRegexLiteral();
51       handleTemplateStrings();
52     }
53     tryMergePreviousTokens();
54     if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
55       FirstInLineIndex = Tokens.size() - 1;
56   } while (Tokens.back()->Tok.isNot(tok::eof));
57   return Tokens;
58 }
59 
60 void FormatTokenLexer::tryMergePreviousTokens() {
61   if (tryMerge_TMacro())
62     return;
63   if (tryMergeConflictMarkers())
64     return;
65   if (tryMergeLessLess())
66     return;
67   if (tryMergeNSStringLiteral())
68     return;
69 
70   if (Style.Language == FormatStyle::LK_JavaScript) {
71     static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
72     static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
73                                                    tok::equal};
74     static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
75                                                   tok::greaterequal};
76     static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
77     static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
78     static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
79                                                            tok::starequal};
80 
81     // FIXME: Investigate what token type gives the correct operator priority.
82     if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
83       return;
84     if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
85       return;
86     if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
87       return;
88     if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
89       return;
90     if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
91       return;
92     if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
93       Tokens.back()->Tok.setKind(tok::starequal);
94       return;
95     }
96   }
97 
98   if (Style.Language == FormatStyle::LK_Java) {
99     static const tok::TokenKind JavaRightLogicalShift[] = {
100         tok::greater, tok::greater, tok::greater};
101     static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
102         tok::greater, tok::greater, tok::greaterequal};
103     if (tryMergeTokens(JavaRightLogicalShift, TT_BinaryOperator))
104       return;
105     if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
106       return;
107   }
108 }
109 
110 bool FormatTokenLexer::tryMergeNSStringLiteral() {
111   if (Tokens.size() < 2)
112     return false;
113   auto &At = *(Tokens.end() - 2);
114   auto &String = *(Tokens.end() - 1);
115   if (!At->is(tok::at) || !String->is(tok::string_literal))
116     return false;
117   At->Tok.setKind(tok::string_literal);
118   At->TokenText = StringRef(At->TokenText.begin(),
119                             String->TokenText.end() - At->TokenText.begin());
120   At->ColumnWidth += String->ColumnWidth;
121   At->Type = TT_ObjCStringLiteral;
122   Tokens.erase(Tokens.end() - 1);
123   return true;
124 }
125 
126 bool FormatTokenLexer::tryMergeLessLess() {
127   // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
128   if (Tokens.size() < 3)
129     return false;
130 
131   bool FourthTokenIsLess = false;
132   if (Tokens.size() > 3)
133     FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
134 
135   auto First = Tokens.end() - 3;
136   if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
137       First[0]->isNot(tok::less) || FourthTokenIsLess)
138     return false;
139 
140   // Only merge if there currently is no whitespace between the two "<".
141   if (First[1]->WhitespaceRange.getBegin() !=
142       First[1]->WhitespaceRange.getEnd())
143     return false;
144 
145   First[0]->Tok.setKind(tok::lessless);
146   First[0]->TokenText = "<<";
147   First[0]->ColumnWidth += 1;
148   Tokens.erase(Tokens.end() - 2);
149   return true;
150 }
151 
152 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
153                                       TokenType NewType) {
154   if (Tokens.size() < Kinds.size())
155     return false;
156 
157   SmallVectorImpl<FormatToken *>::const_iterator First =
158       Tokens.end() - Kinds.size();
159   if (!First[0]->is(Kinds[0]))
160     return false;
161   unsigned AddLength = 0;
162   for (unsigned i = 1; i < Kinds.size(); ++i) {
163     if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
164                                        First[i]->WhitespaceRange.getEnd())
165       return false;
166     AddLength += First[i]->TokenText.size();
167   }
168   Tokens.resize(Tokens.size() - Kinds.size() + 1);
169   First[0]->TokenText = StringRef(First[0]->TokenText.data(),
170                                   First[0]->TokenText.size() + AddLength);
171   First[0]->ColumnWidth += AddLength;
172   First[0]->Type = NewType;
173   return true;
174 }
175 
176 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
177 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
178   // NB: This is not entirely correct, as an r_paren can introduce an operand
179   // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
180   // corner case to not matter in practice, though.
181   return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
182                       tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
183                       tok::colon, tok::question, tok::tilde) ||
184          Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
185                       tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
186                       tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
187          Tok->isBinaryOperator();
188 }
189 
190 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
191   if (!Prev)
192     return true;
193 
194   // Regex literals can only follow after prefix unary operators, not after
195   // postfix unary operators. If the '++' is followed by a non-operand
196   // introducing token, the slash here is the operand and not the start of a
197   // regex.
198   // `!` is an unary prefix operator, but also a post-fix operator that casts
199   // away nullability, so the same check applies.
200   if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
201     return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
202 
203   // The previous token must introduce an operand location where regex
204   // literals can occur.
205   if (!precedesOperand(Prev))
206     return false;
207 
208   return true;
209 }
210 
211 // Tries to parse a JavaScript Regex literal starting at the current token,
212 // if that begins with a slash and is in a location where JavaScript allows
213 // regex literals. Changes the current token to a regex literal and updates
214 // its text if successful.
215 void FormatTokenLexer::tryParseJSRegexLiteral() {
216   FormatToken *RegexToken = Tokens.back();
217   if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
218     return;
219 
220   FormatToken *Prev = nullptr;
221   for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
222     // NB: Because previous pointers are not initialized yet, this cannot use
223     // Token.getPreviousNonComment.
224     if ((*I)->isNot(tok::comment)) {
225       Prev = *I;
226       break;
227     }
228   }
229 
230   if (!canPrecedeRegexLiteral(Prev))
231     return;
232 
233   // 'Manually' lex ahead in the current file buffer.
234   const char *Offset = Lex->getBufferLocation();
235   const char *RegexBegin = Offset - RegexToken->TokenText.size();
236   StringRef Buffer = Lex->getBuffer();
237   bool InCharacterClass = false;
238   bool HaveClosingSlash = false;
239   for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
240     // Regular expressions are terminated with a '/', which can only be
241     // escaped using '\' or a character class between '[' and ']'.
242     // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
243     switch (*Offset) {
244     case '\\':
245       // Skip the escaped character.
246       ++Offset;
247       break;
248     case '[':
249       InCharacterClass = true;
250       break;
251     case ']':
252       InCharacterClass = false;
253       break;
254     case '/':
255       if (!InCharacterClass)
256         HaveClosingSlash = true;
257       break;
258     }
259   }
260 
261   RegexToken->Type = TT_RegexLiteral;
262   // Treat regex literals like other string_literals.
263   RegexToken->Tok.setKind(tok::string_literal);
264   RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
265   RegexToken->ColumnWidth = RegexToken->TokenText.size();
266 
267   resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
268 }
269 
270 void FormatTokenLexer::handleTemplateStrings() {
271   FormatToken *BacktickToken = Tokens.back();
272 
273   if (BacktickToken->is(tok::l_brace)) {
274     StateStack.push(LexerState::NORMAL);
275     return;
276   }
277   if (BacktickToken->is(tok::r_brace)) {
278     if (StateStack.size() == 1)
279       return;
280     StateStack.pop();
281     if (StateStack.top() != LexerState::TEMPLATE_STRING)
282       return;
283     // If back in TEMPLATE_STRING, fallthrough and continue parsing the
284   } else if (BacktickToken->is(tok::unknown) &&
285              BacktickToken->TokenText == "`") {
286     StateStack.push(LexerState::TEMPLATE_STRING);
287   } else {
288     return; // Not actually a template
289   }
290 
291   // 'Manually' lex ahead in the current file buffer.
292   const char *Offset = Lex->getBufferLocation();
293   const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
294   for (; Offset != Lex->getBuffer().end(); ++Offset) {
295     if (Offset[0] == '`') {
296       StateStack.pop();
297       break;
298     }
299     if (Offset[0] == '\\') {
300       ++Offset; // Skip the escaped character.
301     } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
302                Offset[1] == '{') {
303       // '${' introduces an expression interpolation in the template string.
304       StateStack.push(LexerState::NORMAL);
305       ++Offset;
306       break;
307     }
308   }
309 
310   StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
311   BacktickToken->Type = TT_TemplateString;
312   BacktickToken->Tok.setKind(tok::string_literal);
313   BacktickToken->TokenText = LiteralText;
314 
315   // Adjust width for potentially multiline string literals.
316   size_t FirstBreak = LiteralText.find('\n');
317   StringRef FirstLineText = FirstBreak == StringRef::npos
318                                 ? LiteralText
319                                 : LiteralText.substr(0, FirstBreak);
320   BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
321       FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
322   size_t LastBreak = LiteralText.rfind('\n');
323   if (LastBreak != StringRef::npos) {
324     BacktickToken->IsMultiline = true;
325     unsigned StartColumn = 0; // The template tail spans the entire line.
326     BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
327         LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
328         Style.TabWidth, Encoding);
329   }
330 
331   SourceLocation loc = Offset < Lex->getBuffer().end()
332                            ? Lex->getSourceLocation(Offset + 1)
333                            : SourceMgr.getLocForEndOfFile(ID);
334   resetLexer(SourceMgr.getFileOffset(loc));
335 }
336 
337 bool FormatTokenLexer::tryMerge_TMacro() {
338   if (Tokens.size() < 4)
339     return false;
340   FormatToken *Last = Tokens.back();
341   if (!Last->is(tok::r_paren))
342     return false;
343 
344   FormatToken *String = Tokens[Tokens.size() - 2];
345   if (!String->is(tok::string_literal) || String->IsMultiline)
346     return false;
347 
348   if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
349     return false;
350 
351   FormatToken *Macro = Tokens[Tokens.size() - 4];
352   if (Macro->TokenText != "_T")
353     return false;
354 
355   const char *Start = Macro->TokenText.data();
356   const char *End = Last->TokenText.data() + Last->TokenText.size();
357   String->TokenText = StringRef(Start, End - Start);
358   String->IsFirst = Macro->IsFirst;
359   String->LastNewlineOffset = Macro->LastNewlineOffset;
360   String->WhitespaceRange = Macro->WhitespaceRange;
361   String->OriginalColumn = Macro->OriginalColumn;
362   String->ColumnWidth = encoding::columnWidthWithTabs(
363       String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
364   String->NewlinesBefore = Macro->NewlinesBefore;
365   String->HasUnescapedNewline = Macro->HasUnescapedNewline;
366 
367   Tokens.pop_back();
368   Tokens.pop_back();
369   Tokens.pop_back();
370   Tokens.back() = String;
371   return true;
372 }
373 
374 bool FormatTokenLexer::tryMergeConflictMarkers() {
375   if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
376     return false;
377 
378   // Conflict lines look like:
379   // <marker> <text from the vcs>
380   // For example:
381   // >>>>>>> /file/in/file/system at revision 1234
382   //
383   // We merge all tokens in a line that starts with a conflict marker
384   // into a single token with a special token type that the unwrapped line
385   // parser will use to correctly rebuild the underlying code.
386 
387   FileID ID;
388   // Get the position of the first token in the line.
389   unsigned FirstInLineOffset;
390   std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
391       Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
392   StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
393   // Calculate the offset of the start of the current line.
394   auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
395   if (LineOffset == StringRef::npos) {
396     LineOffset = 0;
397   } else {
398     ++LineOffset;
399   }
400 
401   auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
402   StringRef LineStart;
403   if (FirstSpace == StringRef::npos) {
404     LineStart = Buffer.substr(LineOffset);
405   } else {
406     LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
407   }
408 
409   TokenType Type = TT_Unknown;
410   if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
411     Type = TT_ConflictStart;
412   } else if (LineStart == "|||||||" || LineStart == "=======" ||
413              LineStart == "====") {
414     Type = TT_ConflictAlternative;
415   } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
416     Type = TT_ConflictEnd;
417   }
418 
419   if (Type != TT_Unknown) {
420     FormatToken *Next = Tokens.back();
421 
422     Tokens.resize(FirstInLineIndex + 1);
423     // We do not need to build a complete token here, as we will skip it
424     // during parsing anyway (as we must not touch whitespace around conflict
425     // markers).
426     Tokens.back()->Type = Type;
427     Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
428 
429     Tokens.push_back(Next);
430     return true;
431   }
432 
433   return false;
434 }
435 
436 FormatToken *FormatTokenLexer::getStashedToken() {
437   // Create a synthesized second '>' or '<' token.
438   Token Tok = FormatTok->Tok;
439   StringRef TokenText = FormatTok->TokenText;
440 
441   unsigned OriginalColumn = FormatTok->OriginalColumn;
442   FormatTok = new (Allocator.Allocate()) FormatToken;
443   FormatTok->Tok = Tok;
444   SourceLocation TokLocation =
445       FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
446   FormatTok->Tok.setLocation(TokLocation);
447   FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
448   FormatTok->TokenText = TokenText;
449   FormatTok->ColumnWidth = 1;
450   FormatTok->OriginalColumn = OriginalColumn + 1;
451 
452   return FormatTok;
453 }
454 
455 FormatToken *FormatTokenLexer::getNextToken() {
456   if (StateStack.top() == LexerState::TOKEN_STASHED) {
457     StateStack.pop();
458     return getStashedToken();
459   }
460 
461   FormatTok = new (Allocator.Allocate()) FormatToken;
462   readRawToken(*FormatTok);
463   SourceLocation WhitespaceStart =
464       FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
465   FormatTok->IsFirst = IsFirstToken;
466   IsFirstToken = false;
467 
468   // Consume and record whitespace until we find a significant token.
469   unsigned WhitespaceLength = TrailingWhitespace;
470   while (FormatTok->Tok.is(tok::unknown)) {
471     StringRef Text = FormatTok->TokenText;
472     auto EscapesNewline = [&](int pos) {
473       // A '\r' here is just part of '\r\n'. Skip it.
474       if (pos >= 0 && Text[pos] == '\r')
475         --pos;
476       // See whether there is an odd number of '\' before this.
477       // FIXME: This is wrong. A '\' followed by a newline is always removed,
478       // regardless of whether there is another '\' before it.
479       // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
480       unsigned count = 0;
481       for (; pos >= 0; --pos, ++count)
482         if (Text[pos] != '\\')
483           break;
484       return count & 1;
485     };
486     // FIXME: This miscounts tok:unknown tokens that are not just
487     // whitespace, e.g. a '`' character.
488     for (int i = 0, e = Text.size(); i != e; ++i) {
489       switch (Text[i]) {
490       case '\n':
491         ++FormatTok->NewlinesBefore;
492         FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
493         FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
494         Column = 0;
495         break;
496       case '\r':
497         FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
498         Column = 0;
499         break;
500       case '\f':
501       case '\v':
502         Column = 0;
503         break;
504       case ' ':
505         ++Column;
506         break;
507       case '\t':
508         Column += Style.TabWidth - Column % Style.TabWidth;
509         break;
510       case '\\':
511         if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
512           FormatTok->Type = TT_ImplicitStringLiteral;
513         break;
514       default:
515         FormatTok->Type = TT_ImplicitStringLiteral;
516         break;
517       }
518       if (FormatTok->Type == TT_ImplicitStringLiteral)
519         break;
520     }
521 
522     if (FormatTok->is(TT_ImplicitStringLiteral))
523       break;
524     WhitespaceLength += FormatTok->Tok.getLength();
525 
526     readRawToken(*FormatTok);
527   }
528 
529   // JavaScript and Java do not allow to escape the end of the line with a
530   // backslash. Backslashes are syntax errors in plain source, but can occur in
531   // comments. When a single line comment ends with a \, it'll cause the next
532   // line of code to be lexed as a comment, breaking formatting. The code below
533   // finds comments that contain a backslash followed by a line break, truncates
534   // the comment token at the backslash, and resets the lexer to restart behind
535   // the backslash.
536   if ((Style.Language == FormatStyle::LK_JavaScript ||
537        Style.Language == FormatStyle::LK_Java) &&
538       FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
539     size_t BackslashPos = FormatTok->TokenText.find('\\');
540     while (BackslashPos != StringRef::npos) {
541       if (BackslashPos + 1 < FormatTok->TokenText.size() &&
542           FormatTok->TokenText[BackslashPos + 1] == '\n') {
543         const char *Offset = Lex->getBufferLocation();
544         Offset -= FormatTok->TokenText.size();
545         Offset += BackslashPos + 1;
546         resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
547         FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
548         FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
549             FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
550             Encoding);
551         break;
552       }
553       BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
554     }
555   }
556 
557   // In case the token starts with escaped newlines, we want to
558   // take them into account as whitespace - this pattern is quite frequent
559   // in macro definitions.
560   // FIXME: Add a more explicit test.
561   while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\' &&
562          FormatTok->TokenText[1] == '\n') {
563     ++FormatTok->NewlinesBefore;
564     WhitespaceLength += 2;
565     FormatTok->LastNewlineOffset = 2;
566     Column = 0;
567     FormatTok->TokenText = FormatTok->TokenText.substr(2);
568   }
569 
570   FormatTok->WhitespaceRange = SourceRange(
571       WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
572 
573   FormatTok->OriginalColumn = Column;
574 
575   TrailingWhitespace = 0;
576   if (FormatTok->Tok.is(tok::comment)) {
577     // FIXME: Add the trimmed whitespace to Column.
578     StringRef UntrimmedText = FormatTok->TokenText;
579     FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
580     TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
581   } else if (FormatTok->Tok.is(tok::raw_identifier)) {
582     IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
583     FormatTok->Tok.setIdentifierInfo(&Info);
584     FormatTok->Tok.setKind(Info.getTokenID());
585     if (Style.Language == FormatStyle::LK_Java &&
586         FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
587                            tok::kw_operator)) {
588       FormatTok->Tok.setKind(tok::identifier);
589       FormatTok->Tok.setIdentifierInfo(nullptr);
590     } else if (Style.Language == FormatStyle::LK_JavaScript &&
591                FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
592                                   tok::kw_operator)) {
593       FormatTok->Tok.setKind(tok::identifier);
594       FormatTok->Tok.setIdentifierInfo(nullptr);
595     }
596   } else if (FormatTok->Tok.is(tok::greatergreater)) {
597     FormatTok->Tok.setKind(tok::greater);
598     FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
599     ++Column;
600     StateStack.push(LexerState::TOKEN_STASHED);
601   } else if (FormatTok->Tok.is(tok::lessless)) {
602     FormatTok->Tok.setKind(tok::less);
603     FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
604     ++Column;
605     StateStack.push(LexerState::TOKEN_STASHED);
606   }
607 
608   // Now FormatTok is the next non-whitespace token.
609 
610   StringRef Text = FormatTok->TokenText;
611   size_t FirstNewlinePos = Text.find('\n');
612   if (FirstNewlinePos == StringRef::npos) {
613     // FIXME: ColumnWidth actually depends on the start column, we need to
614     // take this into account when the token is moved.
615     FormatTok->ColumnWidth =
616         encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
617     Column += FormatTok->ColumnWidth;
618   } else {
619     FormatTok->IsMultiline = true;
620     // FIXME: ColumnWidth actually depends on the start column, we need to
621     // take this into account when the token is moved.
622     FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
623         Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
624 
625     // The last line of the token always starts in column 0.
626     // Thus, the length can be precomputed even in the presence of tabs.
627     FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
628         Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
629     Column = FormatTok->LastLineColumnWidth;
630   }
631 
632   if (Style.isCpp()) {
633     if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
634           Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
635               tok::pp_define) &&
636         std::find(ForEachMacros.begin(), ForEachMacros.end(),
637                   FormatTok->Tok.getIdentifierInfo()) != ForEachMacros.end()) {
638       FormatTok->Type = TT_ForEachMacro;
639     } else if (FormatTok->is(tok::identifier)) {
640       if (MacroBlockBeginRegex.match(Text)) {
641         FormatTok->Type = TT_MacroBlockBegin;
642       } else if (MacroBlockEndRegex.match(Text)) {
643         FormatTok->Type = TT_MacroBlockEnd;
644       }
645     }
646   }
647 
648   return FormatTok;
649 }
650 
651 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
652   Lex->LexFromRawLexer(Tok.Tok);
653   Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
654                             Tok.Tok.getLength());
655   // For formatting, treat unterminated string literals like normal string
656   // literals.
657   if (Tok.is(tok::unknown)) {
658     if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
659       Tok.Tok.setKind(tok::string_literal);
660       Tok.IsUnterminatedLiteral = true;
661     } else if (Style.Language == FormatStyle::LK_JavaScript &&
662                Tok.TokenText == "''") {
663       Tok.Tok.setKind(tok::string_literal);
664     }
665   }
666 
667   if (Style.Language == FormatStyle::LK_JavaScript &&
668       Tok.is(tok::char_constant)) {
669     Tok.Tok.setKind(tok::string_literal);
670   }
671 
672   if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
673                                Tok.TokenText == "/* clang-format on */")) {
674     FormattingDisabled = false;
675   }
676 
677   Tok.Finalized = FormattingDisabled;
678 
679   if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
680                                Tok.TokenText == "/* clang-format off */")) {
681     FormattingDisabled = true;
682   }
683 }
684 
685 void FormatTokenLexer::resetLexer(unsigned Offset) {
686   StringRef Buffer = SourceMgr.getBufferData(ID);
687   Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
688                       getFormattingLangOpts(Style), Buffer.begin(),
689                       Buffer.begin() + Offset, Buffer.end()));
690   Lex->SetKeepWhitespaceMode(true);
691   TrailingWhitespace = 0;
692 }
693 
694 } // namespace format
695 } // namespace clang
696