1 //===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "clang-pseudo/Token.h" 10 #include "clang/Basic/IdentifierTable.h" 11 #include "clang/Basic/SourceLocation.h" 12 #include "clang/Basic/TokenKinds.h" 13 #include "clang/Lex/Lexer.h" 14 #include "clang/Lex/LiteralSupport.h" 15 16 namespace clang { 17 namespace pseudo { 18 19 TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) { 20 clang::SourceLocation Start; 21 // Tokenize using clang's lexer in raw mode. 22 // std::string guarantees null-termination, which the lexer needs. 23 clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(), 24 Code.data() + Code.size()); 25 Lexer.SetCommentRetentionState(true); 26 27 TokenStream Result; 28 clang::Token CT; 29 unsigned LastOffset = 0; 30 unsigned Line = 0; 31 unsigned Indent = 0; 32 for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof; 33 Lexer.LexFromRawLexer(CT)) { 34 unsigned Offset = 35 CT.getLocation().getRawEncoding() - Start.getRawEncoding(); 36 37 Token Tok; 38 Tok.Data = &Code[Offset]; 39 Tok.Length = CT.getLength(); 40 Tok.Kind = CT.getKind(); 41 42 // Update current line number and indentation from raw source code. 43 unsigned NewLineStart = 0; 44 for (unsigned I = LastOffset; I < Offset; ++I) { 45 if (Code[I] == '\n') { 46 NewLineStart = I + 1; 47 ++Line; 48 } 49 } 50 if (NewLineStart || !LastOffset) { 51 Indent = 0; 52 for (char C : StringRef(Code).slice(NewLineStart, Offset)) { 53 if (C == ' ') 54 ++Indent; 55 else if (C == '\t') 56 Indent += 8; 57 else 58 break; 59 } 60 } 61 Tok.Indent = Indent; 62 Tok.Line = Line; 63 64 if (CT.isAtStartOfLine()) 65 Tok.setFlag(LexFlags::StartsPPLine); 66 if (CT.needsCleaning() || CT.hasUCN()) 67 Tok.setFlag(LexFlags::NeedsCleaning); 68 69 Result.push(Tok); 70 LastOffset = Offset; 71 } 72 Result.finalize(); 73 return Result; 74 } 75 76 TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) { 77 auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>(); 78 clang::IdentifierTable Identifiers(LangOpts); 79 TokenStream Result(CleanedStorage); 80 81 for (auto Tok : Code.tokens()) { 82 if (Tok.flag(LexFlags::NeedsCleaning)) { 83 // Remove escaped newlines and trigraphs. 84 llvm::SmallString<64> CleanBuffer; 85 const char *Pos = Tok.text().begin(); 86 while (Pos < Tok.text().end()) { 87 unsigned CharSize = 0; 88 CleanBuffer.push_back( 89 clang::Lexer::getCharAndSizeNoWarn(Pos, CharSize, LangOpts)); 90 assert(CharSize != 0 && "no progress!"); 91 Pos += CharSize; 92 } 93 // Remove universal character names (UCN). 94 llvm::SmallString<64> UCNBuffer; 95 clang::expandUCNs(UCNBuffer, CleanBuffer); 96 97 llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage); 98 Tok.Data = Text.data(); 99 Tok.Length = Text.size(); 100 Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning); 101 } 102 103 if (Tok.Kind == tok::raw_identifier) { 104 // Cook raw_identifiers into identifier, keyword, etc. 105 Tok.Kind = Identifiers.get(Tok.text()).getTokenID(); 106 } else if (Tok.Kind == tok::greatergreater) { 107 // Split the greatergreater token. 108 // FIXME: split lessless token to support Cuda triple angle brackets <<<. 109 assert(Tok.text() == ">>"); 110 Tok.Kind = tok::greater; 111 Tok.Length = 1; 112 Result.push(Tok); 113 // Line is wrong if the first greater is followed by an escaped newline! 114 Tok.Data = Tok.text().data() + 1; 115 } 116 117 Result.push(std::move(Tok)); 118 } 119 120 Result.finalize(); 121 return Result; 122 } 123 124 } // namespace pseudo 125 } // namespace clang 126