1 //===--- BreakableToken.cpp - Format C++ code -----------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Contains implementation of BreakableToken class and classes derived
12 /// from it.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #define DEBUG_TYPE "format-token-breaker"
17 
18 #include "BreakableToken.h"
19 #include "clang/Basic/CharInfo.h"
20 #include "clang/Format/Format.h"
21 #include "llvm/ADT/STLExtras.h"
22 #include "llvm/Support/Debug.h"
23 #include <algorithm>
24 
25 namespace clang {
26 namespace format {
27 namespace {
28 
/// The set of characters treated as blanks when looking for split points:
/// space, horizontal tab, vertical tab and form feed.
static const char *Blanks = " \t\v\f";

/// \brief Returns true if \p C is a space, horizontal tab, vertical tab or
/// form feed, i.e. one of the characters listed in \c Blanks.
static bool IsBlank(char C) {
  return C == ' ' || C == '\t' || C == '\v' || C == '\f';
}
41 
42 BreakableToken::Split getCommentSplit(StringRef Text,
43                                       unsigned ContentStartColumn,
44                                       unsigned ColumnLimit,
45                                       encoding::Encoding Encoding) {
46   if (ColumnLimit <= ContentStartColumn + 1)
47     return BreakableToken::Split(StringRef::npos, 0);
48 
49   unsigned MaxSplit = ColumnLimit - ContentStartColumn + 1;
50   unsigned MaxSplitBytes = 0;
51 
52   for (unsigned NumChars = 0;
53        NumChars < MaxSplit && MaxSplitBytes < Text.size(); ++NumChars)
54     MaxSplitBytes +=
55         encoding::getCodePointNumBytes(Text[MaxSplitBytes], Encoding);
56 
57   StringRef::size_type SpaceOffset = Text.find_last_of(Blanks, MaxSplitBytes);
58   if (SpaceOffset == StringRef::npos ||
59       // Don't break at leading whitespace.
60       Text.find_last_not_of(Blanks, SpaceOffset) == StringRef::npos) {
61     // Make sure that we don't break at leading whitespace that
62     // reaches past MaxSplit.
63     StringRef::size_type FirstNonWhitespace = Text.find_first_not_of(Blanks);
64     if (FirstNonWhitespace == StringRef::npos)
65       // If the comment is only whitespace, we cannot split.
66       return BreakableToken::Split(StringRef::npos, 0);
67     SpaceOffset = Text.find_first_of(
68         Blanks, std::max<unsigned>(MaxSplitBytes, FirstNonWhitespace));
69   }
70   if (SpaceOffset != StringRef::npos && SpaceOffset != 0) {
71     StringRef BeforeCut = Text.substr(0, SpaceOffset).rtrim(Blanks);
72     StringRef AfterCut = Text.substr(SpaceOffset).ltrim(Blanks);
73     return BreakableToken::Split(BeforeCut.size(),
74                                  AfterCut.begin() - BeforeCut.end());
75   }
76   return BreakableToken::Split(StringRef::npos, 0);
77 }
78 
79 BreakableToken::Split getStringSplit(StringRef Text,
80                                      unsigned ContentStartColumn,
81                                      unsigned ColumnLimit,
82                                      encoding::Encoding Encoding) {
83   // FIXME: Reduce unit test case.
84   if (Text.empty())
85     return BreakableToken::Split(StringRef::npos, 0);
86   if (ColumnLimit <= ContentStartColumn)
87     return BreakableToken::Split(StringRef::npos, 0);
88   unsigned MaxSplit =
89       std::min<unsigned>(ColumnLimit - ContentStartColumn,
90                          encoding::getCodePointCount(Text, Encoding) - 1);
91   StringRef::size_type SpaceOffset = 0;
92   StringRef::size_type SlashOffset = 0;
93   StringRef::size_type WordStartOffset = 0;
94   StringRef::size_type SplitPoint = 0;
95   for (unsigned Chars = 0;;) {
96     unsigned Advance;
97     if (Text[0] == '\\') {
98       Advance = encoding::getEscapeSequenceLength(Text);
99       Chars += Advance;
100     } else {
101       Advance = encoding::getCodePointNumBytes(Text[0], Encoding);
102       Chars += 1;
103     }
104 
105     if (Chars > MaxSplit)
106       break;
107 
108     if (IsBlank(Text[0]))
109       SpaceOffset = SplitPoint;
110     if (Text[0] == '/')
111       SlashOffset = SplitPoint;
112     if (Advance == 1 && !isAlphanumeric(Text[0]))
113       WordStartOffset = SplitPoint;
114 
115     SplitPoint += Advance;
116     Text = Text.substr(Advance);
117   }
118 
119   if (SpaceOffset != 0)
120     return BreakableToken::Split(SpaceOffset + 1, 0);
121   if (SlashOffset != 0)
122     return BreakableToken::Split(SlashOffset + 1, 0);
123   if (WordStartOffset != 0)
124     return BreakableToken::Split(WordStartOffset + 1, 0);
125   if (SplitPoint != 0)
126     return BreakableToken::Split(SplitPoint, 0);
127   return BreakableToken::Split(StringRef::npos, 0);
128 }
129 
130 } // namespace
131 
132 unsigned BreakableSingleLineToken::getLineCount() const { return 1; }
133 
134 unsigned BreakableSingleLineToken::getLineLengthAfterSplit(
135     unsigned LineIndex, unsigned Offset, StringRef::size_type Length) const {
136   return StartColumn + Prefix.size() + Postfix.size() +
137          encoding::getCodePointCount(Line.substr(Offset, Length), Encoding);
138 }
139 
140 BreakableSingleLineToken::BreakableSingleLineToken(
141     const FormatToken &Tok, unsigned StartColumn, StringRef Prefix,
142     StringRef Postfix, bool InPPDirective, encoding::Encoding Encoding)
143     : BreakableToken(Tok, InPPDirective, Encoding), StartColumn(StartColumn),
144       Prefix(Prefix), Postfix(Postfix) {
145   assert(Tok.TokenText.startswith(Prefix) && Tok.TokenText.endswith(Postfix));
146   Line = Tok.TokenText.substr(
147       Prefix.size(), Tok.TokenText.size() - Prefix.size() - Postfix.size());
148 }
149 
150 BreakableStringLiteral::BreakableStringLiteral(const FormatToken &Tok,
151                                                unsigned StartColumn,
152                                                bool InPPDirective,
153                                                encoding::Encoding Encoding)
154     : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"", InPPDirective,
155                                Encoding) {}
156 
157 BreakableToken::Split
158 BreakableStringLiteral::getSplit(unsigned LineIndex, unsigned TailOffset,
159                                  unsigned ColumnLimit) const {
160   return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit,
161                         Encoding);
162 }
163 
164 void BreakableStringLiteral::insertBreak(unsigned LineIndex,
165                                          unsigned TailOffset, Split Split,
166                                          WhitespaceManager &Whitespaces) {
167   Whitespaces.replaceWhitespaceInToken(
168       Tok, Prefix.size() + TailOffset + Split.first, Split.second, Postfix,
169       Prefix, InPPDirective, 1, StartColumn);
170 }
171 
172 static StringRef getLineCommentPrefix(StringRef Comment) {
173   const char *KnownPrefixes[] = { "/// ", "///", "// ", "//" };
174   for (size_t i = 0, e = llvm::array_lengthof(KnownPrefixes); i != e; ++i)
175     if (Comment.startswith(KnownPrefixes[i]))
176       return KnownPrefixes[i];
177   return "";
178 }
179 
180 BreakableLineComment::BreakableLineComment(const FormatToken &Token,
181                                            unsigned StartColumn,
182                                            bool InPPDirective,
183                                            encoding::Encoding Encoding)
184     : BreakableSingleLineToken(Token, StartColumn,
185                                getLineCommentPrefix(Token.TokenText), "",
186                                InPPDirective, Encoding) {
187   OriginalPrefix = Prefix;
188   if (Token.TokenText.size() > Prefix.size() &&
189       isAlphanumeric(Token.TokenText[Prefix.size()])) {
190     if (Prefix == "//")
191       Prefix = "// ";
192     else if (Prefix == "///")
193       Prefix = "/// ";
194   }
195 }
196 
197 BreakableToken::Split
198 BreakableLineComment::getSplit(unsigned LineIndex, unsigned TailOffset,
199                                unsigned ColumnLimit) const {
200   return getCommentSplit(Line.substr(TailOffset), StartColumn + Prefix.size(),
201                          ColumnLimit, Encoding);
202 }
203 
204 void BreakableLineComment::insertBreak(unsigned LineIndex, unsigned TailOffset,
205                                        Split Split,
206                                        WhitespaceManager &Whitespaces) {
207   Whitespaces.replaceWhitespaceInToken(
208       Tok, OriginalPrefix.size() + TailOffset + Split.first, Split.second,
209       Postfix, Prefix, InPPDirective, 1, StartColumn);
210 }
211 
212 void
213 BreakableLineComment::replaceWhitespaceBefore(unsigned LineIndex,
214                                               WhitespaceManager &Whitespaces) {
215   if (OriginalPrefix != Prefix) {
216     Whitespaces.replaceWhitespaceInToken(Tok, OriginalPrefix.size(), 0, "", "",
217                                          false, 0, 1);
218   }
219 }
220 
221 BreakableBlockComment::BreakableBlockComment(
222     const FormatStyle &Style, const FormatToken &Token, unsigned StartColumn,
223     unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective,
224     encoding::Encoding Encoding)
225     : BreakableToken(Token, InPPDirective, Encoding) {
226   StringRef TokenText(Token.TokenText);
227   assert(TokenText.startswith("/*") && TokenText.endswith("*/"));
228   TokenText.substr(2, TokenText.size() - 4).split(Lines, "\n");
229 
230   int IndentDelta = StartColumn - OriginalStartColumn;
231   bool NeedsStar = true;
232   LeadingWhitespace.resize(Lines.size());
233   StartOfLineColumn.resize(Lines.size());
234   if (Lines.size() == 1 && !FirstInLine) {
235     // Comments for which FirstInLine is false can start on arbitrary column,
236     // and available horizontal space can be too small to align consecutive
237     // lines with the first one.
238     // FIXME: We could, probably, align them to current indentation level, but
239     // now we just wrap them without stars.
240     NeedsStar = false;
241   }
242   StartOfLineColumn[0] = StartColumn + 2;
243   for (size_t i = 1; i < Lines.size(); ++i) {
244     adjustWhitespace(Style, i, IndentDelta);
245     if (Lines[i].empty())
246       // If the last line is empty, the closing "*/" will have a star.
247       NeedsStar = NeedsStar && i + 1 == Lines.size();
248     else
249       NeedsStar = NeedsStar && Lines[i][0] == '*';
250   }
251   Decoration = NeedsStar ? "* " : "";
252   IndentAtLineBreak = StartOfLineColumn[0] + 1;
253   for (size_t i = 1; i < Lines.size(); ++i) {
254     if (Lines[i].empty()) {
255       if (!NeedsStar && i + 1 != Lines.size())
256         // For all but the last line (which always ends in */), set the
257         // start column to 0 if they're empty, so we do not insert
258         // trailing whitespace anywhere.
259         StartOfLineColumn[i] = 0;
260       continue;
261     }
262     if (NeedsStar) {
263       // The first line already excludes the star.
264       // For all other lines, adjust the line to exclude the star and
265       // (optionally) the first whitespace.
266       int Offset = Lines[i].startswith("* ") ? 2 : 1;
267       StartOfLineColumn[i] += Offset;
268       Lines[i] = Lines[i].substr(Offset);
269       LeadingWhitespace[i] += Offset;
270     }
271     IndentAtLineBreak = std::min<int>(IndentAtLineBreak, StartOfLineColumn[i]);
272   }
273   IndentAtLineBreak = std::max<unsigned>(IndentAtLineBreak, Decoration.size());
274   DEBUG({
275     for (size_t i = 0; i < Lines.size(); ++i) {
276       llvm::dbgs() << i << " |" << Lines[i] << "| " << LeadingWhitespace[i]
277                    << "\n";
278     }
279   });
280 }
281 
282 void BreakableBlockComment::adjustWhitespace(const FormatStyle &Style,
283                                              unsigned LineIndex,
284                                              int IndentDelta) {
285   // When in a preprocessor directive, the trailing backslash in a block comment
286   // is not needed, but can serve a purpose of uniformity with necessary escaped
287   // newlines outside the comment. In this case we remove it here before
288   // trimming the trailing whitespace. The backslash will be re-added later when
289   // inserting a line break.
290   size_t EndOfPreviousLine = Lines[LineIndex - 1].size();
291   if (InPPDirective && Lines[LineIndex - 1].endswith("\\"))
292     --EndOfPreviousLine;
293 
294   // Calculate the end of the non-whitespace text in the previous line.
295   EndOfPreviousLine =
296       Lines[LineIndex - 1].find_last_not_of(Blanks, EndOfPreviousLine);
297   if (EndOfPreviousLine == StringRef::npos)
298     EndOfPreviousLine = 0;
299   else
300     ++EndOfPreviousLine;
301   // Calculate the start of the non-whitespace text in the current line.
302   size_t StartOfLine = Lines[LineIndex].find_first_not_of(Blanks);
303   if (StartOfLine == StringRef::npos)
304     StartOfLine = Lines[LineIndex].size();
305 
306   // Adjust Lines to only contain relevant text.
307   Lines[LineIndex - 1] = Lines[LineIndex - 1].substr(0, EndOfPreviousLine);
308   Lines[LineIndex] = Lines[LineIndex].substr(StartOfLine);
309   // Adjust LeadingWhitespace to account all whitespace between the lines
310   // to the current line.
311   LeadingWhitespace[LineIndex] =
312       Lines[LineIndex].begin() - Lines[LineIndex - 1].end();
313 
314   // FIXME: We currently count tabs as 1 character. To solve this, we need to
315   // get the correct indentation width of the start of the comment, which
316   // requires correct counting of the tab expansions before the comment, and
317   // a configurable tab width. Since the current implementation only breaks
318   // if leading tabs are intermixed with spaces, that is not a high priority.
319 
320   // Adjust the start column uniformly accross all lines.
321   StartOfLineColumn[LineIndex] = std::max<int>(0, StartOfLine + IndentDelta);
322 }
323 
324 unsigned BreakableBlockComment::getLineCount() const { return Lines.size(); }
325 
326 unsigned BreakableBlockComment::getLineLengthAfterSplit(
327     unsigned LineIndex, unsigned Offset, StringRef::size_type Length) const {
328   return getContentStartColumn(LineIndex, Offset) +
329          encoding::getCodePointCount(Lines[LineIndex].substr(Offset, Length),
330                                      Encoding) +
331          // The last line gets a "*/" postfix.
332          (LineIndex + 1 == Lines.size() ? 2 : 0);
333 }
334 
335 BreakableToken::Split
336 BreakableBlockComment::getSplit(unsigned LineIndex, unsigned TailOffset,
337                                 unsigned ColumnLimit) const {
338   return getCommentSplit(Lines[LineIndex].substr(TailOffset),
339                          getContentStartColumn(LineIndex, TailOffset),
340                          ColumnLimit, Encoding);
341 }
342 
343 void BreakableBlockComment::insertBreak(unsigned LineIndex, unsigned TailOffset,
344                                         Split Split,
345                                         WhitespaceManager &Whitespaces) {
346   StringRef Text = Lines[LineIndex].substr(TailOffset);
347   StringRef Prefix = Decoration;
348   if (LineIndex + 1 == Lines.size() &&
349       Text.size() == Split.first + Split.second) {
350     // For the last line we need to break before "*/", but not to add "* ".
351     Prefix = "";
352   }
353 
354   unsigned BreakOffsetInToken =
355       Text.data() - Tok.TokenText.data() + Split.first;
356   unsigned CharsToRemove = Split.second;
357   assert(IndentAtLineBreak >= Decoration.size());
358   Whitespaces.replaceWhitespaceInToken(Tok, BreakOffsetInToken, CharsToRemove,
359                                        "", Prefix, InPPDirective, 1,
360                                        IndentAtLineBreak - Decoration.size());
361 }
362 
363 void
364 BreakableBlockComment::replaceWhitespaceBefore(unsigned LineIndex,
365                                                WhitespaceManager &Whitespaces) {
366   if (LineIndex == 0)
367     return;
368   StringRef Prefix = Decoration;
369   if (Lines[LineIndex].empty()) {
370     if (LineIndex + 1 == Lines.size()) {
371       // If the last line is empty, we don't need a prefix, as the */ will line
372       // up with the decoration (if it exists).
373       Prefix = "";
374     } else if (!Decoration.empty()) {
375       // For other empty lines, if we do have a decoration, adapt it to not
376       // contain a trailing whitespace.
377       Prefix = Prefix.substr(0, 1);
378     }
379   } else {
380     if (StartOfLineColumn[LineIndex] == 1) {
381       // This lines starts immediately after the decorating *.
382       Prefix = Prefix.substr(0, 1);
383     }
384   }
385 
386   unsigned WhitespaceOffsetInToken =
387       Lines[LineIndex].data() - Tok.TokenText.data() -
388       LeadingWhitespace[LineIndex];
389   assert(StartOfLineColumn[LineIndex] >= Prefix.size());
390   Whitespaces.replaceWhitespaceInToken(
391       Tok, WhitespaceOffsetInToken, LeadingWhitespace[LineIndex], "", Prefix,
392       InPPDirective, 1, StartOfLineColumn[LineIndex] - Prefix.size());
393 }
394 
395 unsigned
396 BreakableBlockComment::getContentStartColumn(unsigned LineIndex,
397                                              unsigned TailOffset) const {
398   // If we break, we always break at the predefined indent.
399   if (TailOffset != 0)
400     return IndentAtLineBreak;
401   return StartOfLineColumn[LineIndex];
402 }
403 
404 } // namespace format
405 } // namespace clang
406