1 //===- BuildTree.cpp ------------------------------------------*- C++ -*-=====// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 #include "clang/Tooling/Syntax/BuildTree.h" 9 #include "clang/AST/RecursiveASTVisitor.h" 10 #include "clang/AST/Stmt.h" 11 #include "clang/Basic/LLVM.h" 12 #include "clang/Basic/SourceLocation.h" 13 #include "clang/Basic/SourceManager.h" 14 #include "clang/Basic/TokenKinds.h" 15 #include "clang/Lex/Lexer.h" 16 #include "clang/Tooling/Syntax/Nodes.h" 17 #include "clang/Tooling/Syntax/Tokens.h" 18 #include "clang/Tooling/Syntax/Tree.h" 19 #include "llvm/ADT/ArrayRef.h" 20 #include "llvm/ADT/STLExtras.h" 21 #include "llvm/ADT/SmallVector.h" 22 #include "llvm/Support/Allocator.h" 23 #include "llvm/Support/Casting.h" 24 #include "llvm/Support/Compiler.h" 25 #include "llvm/Support/FormatVariadic.h" 26 #include "llvm/Support/raw_ostream.h" 27 #include <map> 28 29 using namespace clang; 30 31 LLVM_ATTRIBUTE_UNUSED 32 static bool isImplicitExpr(clang::Expr *E) { return E->IgnoreImplicit() != E; } 33 34 /// A helper class for constructing the syntax tree while traversing a clang 35 /// AST. 36 /// 37 /// At each point of the traversal we maintain a list of pending nodes. 38 /// Initially all tokens are added as pending nodes. When processing a clang AST 39 /// node, the clients need to: 40 /// - create a corresponding syntax node, 41 /// - assign roles to all pending child nodes with 'markChild' and 42 /// 'markChildToken', 43 /// - replace the child nodes with the new syntax node in the pending list 44 /// with 'foldNode'. 45 /// 46 /// Note that all children are expected to be processed when building a node. 47 /// 48 /// Call finalize() to finish building the tree and consume the root node. 49 class syntax::TreeBuilder { 50 public: 51 TreeBuilder(syntax::Arena &Arena) : Arena(Arena), Pending(Arena) {} 52 53 llvm::BumpPtrAllocator &allocator() { return Arena.allocator(); } 54 55 /// Populate children for \p New node, assuming it covers tokens from \p 56 /// Range. 57 void foldNode(llvm::ArrayRef<syntax::Token> Range, syntax::Tree *New); 58 59 /// Mark the \p Child node with a corresponding \p Role. All marked children 60 /// should be consumed by foldNode. 61 /// (!) when called on expressions (clang::Expr is derived from clang::Stmt), 62 /// wraps expressions into expression statement. 63 void markStmtChild(Stmt *Child, NodeRole Role); 64 /// Should be called for expressions in non-statement position to avoid 65 /// wrapping into expression statement. 66 void markExprChild(Expr *Child, NodeRole Role); 67 68 /// Set role for a token starting at \p Loc. 69 void markChildToken(SourceLocation Loc, tok::TokenKind Kind, NodeRole R); 70 71 /// Finish building the tree and consume the root node. 72 syntax::TranslationUnit *finalize() && { 73 auto Tokens = Arena.tokenBuffer().expandedTokens(); 74 assert(!Tokens.empty()); 75 assert(Tokens.back().kind() == tok::eof); 76 77 // Build the root of the tree, consuming all the children. 78 Pending.foldChildren(Tokens.drop_back(), 79 new (Arena.allocator()) syntax::TranslationUnit); 80 81 return cast<syntax::TranslationUnit>(std::move(Pending).finalize()); 82 } 83 84 /// getRange() finds the syntax tokens corresponding to the passed source 85 /// locations. 86 /// \p First is the start position of the first token and \p Last is the start 87 /// position of the last token. 88 llvm::ArrayRef<syntax::Token> getRange(SourceLocation First, 89 SourceLocation Last) const { 90 assert(First.isValid()); 91 assert(Last.isValid()); 92 assert(First == Last || 93 Arena.sourceManager().isBeforeInTranslationUnit(First, Last)); 94 return llvm::makeArrayRef(findToken(First), std::next(findToken(Last))); 95 } 96 llvm::ArrayRef<syntax::Token> getRange(const Decl *D) const { 97 return getRange(D->getBeginLoc(), D->getEndLoc()); 98 } 99 llvm::ArrayRef<syntax::Token> getExprRange(const Expr *E) const { 100 return getRange(E->getBeginLoc(), E->getEndLoc()); 101 } 102 /// Find the adjusted range for the statement, consuming the trailing 103 /// semicolon when needed. 104 llvm::ArrayRef<syntax::Token> getStmtRange(const Stmt *S) const { 105 auto Tokens = getRange(S->getBeginLoc(), S->getEndLoc()); 106 if (isa<CompoundStmt>(S)) 107 return Tokens; 108 109 // Some statements miss a trailing semicolon, e.g. 'return', 'continue' and 110 // all statements that end with those. Consume this semicolon here. 111 // 112 // (!) statements never consume 'eof', so looking at the next token is ok. 113 if (Tokens.back().kind() != tok::semi && Tokens.end()->kind() == tok::semi) 114 return llvm::makeArrayRef(Tokens.begin(), Tokens.end() + 1); 115 return Tokens; 116 } 117 118 private: 119 /// Finds a token starting at \p L. The token must exist. 120 const syntax::Token *findToken(SourceLocation L) const; 121 122 /// A collection of trees covering the input tokens. 123 /// When created, each tree corresponds to a single token in the file. 124 /// Clients call 'foldChildren' to attach one or more subtrees to a parent 125 /// node and update the list of trees accordingly. 126 /// 127 /// Ensures that added nodes properly nest and cover the whole token stream. 128 struct Forest { 129 Forest(syntax::Arena &A) { 130 assert(!A.tokenBuffer().expandedTokens().empty()); 131 assert(A.tokenBuffer().expandedTokens().back().kind() == tok::eof); 132 // Create all leaf nodes. 133 // Note that we do not have 'eof' in the tree. 134 for (auto &T : A.tokenBuffer().expandedTokens().drop_back()) 135 Trees.insert(Trees.end(), 136 {&T, NodeAndRole{new (A.allocator()) syntax::Leaf(&T)}}); 137 } 138 139 void assignRole(llvm::ArrayRef<syntax::Token> Range, 140 syntax::NodeRole Role) { 141 assert(!Range.empty()); 142 auto It = Trees.lower_bound(Range.begin()); 143 assert(It != Trees.end() && "no node found"); 144 assert(It->first == Range.begin() && "no child with the specified range"); 145 assert((std::next(It) == Trees.end() || 146 std::next(It)->first == Range.end()) && 147 "no child with the specified range"); 148 It->second.Role = Role; 149 } 150 151 /// Add \p Node to the forest and fill its children nodes based on the \p 152 /// NodeRange. 153 void foldChildren(llvm::ArrayRef<syntax::Token> NodeTokens, 154 syntax::Tree *Node) { 155 assert(!NodeTokens.empty()); 156 assert(Node->firstChild() == nullptr && "node already has children"); 157 158 auto *FirstToken = NodeTokens.begin(); 159 auto BeginChildren = Trees.lower_bound(FirstToken); 160 assert(BeginChildren != Trees.end() && 161 BeginChildren->first == FirstToken && 162 "fold crosses boundaries of existing subtrees"); 163 auto EndChildren = Trees.lower_bound(NodeTokens.end()); 164 assert((EndChildren == Trees.end() || 165 EndChildren->first == NodeTokens.end()) && 166 "fold crosses boundaries of existing subtrees"); 167 168 // (!) we need to go in reverse order, because we can only prepend. 169 for (auto It = EndChildren; It != BeginChildren; --It) 170 Node->prependChildLowLevel(std::prev(It)->second.Node, 171 std::prev(It)->second.Role); 172 173 Trees.erase(BeginChildren, EndChildren); 174 Trees.insert({FirstToken, NodeAndRole(Node)}); 175 } 176 177 // EXPECTS: all tokens were consumed and are owned by a single root node. 178 syntax::Node *finalize() && { 179 assert(Trees.size() == 1); 180 auto *Root = Trees.begin()->second.Node; 181 Trees = {}; 182 return Root; 183 } 184 185 std::string str(const syntax::Arena &A) const { 186 std::string R; 187 for (auto It = Trees.begin(); It != Trees.end(); ++It) { 188 unsigned CoveredTokens = 189 It != Trees.end() 190 ? (std::next(It)->first - It->first) 191 : A.tokenBuffer().expandedTokens().end() - It->first; 192 193 R += llvm::formatv("- '{0}' covers '{1}'+{2} tokens\n", 194 It->second.Node->kind(), 195 It->first->text(A.sourceManager()), CoveredTokens); 196 R += It->second.Node->dump(A); 197 } 198 return R; 199 } 200 201 private: 202 /// A with a role that should be assigned to it when adding to a parent. 203 struct NodeAndRole { 204 explicit NodeAndRole(syntax::Node *Node) 205 : Node(Node), Role(NodeRole::Unknown) {} 206 207 syntax::Node *Node; 208 NodeRole Role; 209 }; 210 211 /// Maps from the start token to a subtree starting at that token. 212 /// FIXME: storing the end tokens is redundant. 213 /// FIXME: the key of a map is redundant, it is also stored in NodeForRange. 214 std::map<const syntax::Token *, NodeAndRole> Trees; 215 }; 216 217 /// For debugging purposes. 218 std::string str() { return Pending.str(Arena); } 219 220 syntax::Arena &Arena; 221 Forest Pending; 222 }; 223 224 namespace { 225 class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> { 226 public: 227 explicit BuildTreeVisitor(ASTContext &Ctx, syntax::TreeBuilder &Builder) 228 : Builder(Builder), LangOpts(Ctx.getLangOpts()) {} 229 230 bool shouldTraversePostOrder() const { return true; } 231 232 bool TraverseDecl(Decl *D) { 233 if (!D || isa<TranslationUnitDecl>(D)) 234 return RecursiveASTVisitor::TraverseDecl(D); 235 if (!llvm::isa<TranslationUnitDecl>(D->getDeclContext())) 236 return true; // Only build top-level decls for now, do not recurse. 237 return RecursiveASTVisitor::TraverseDecl(D); 238 } 239 240 bool VisitDecl(Decl *D) { 241 assert(llvm::isa<TranslationUnitDecl>(D->getDeclContext()) && 242 "expected a top-level decl"); 243 assert(!D->isImplicit()); 244 Builder.foldNode(Builder.getRange(D), 245 new (allocator()) syntax::TopLevelDeclaration()); 246 return true; 247 } 248 249 bool WalkUpFromTranslationUnitDecl(TranslationUnitDecl *TU) { 250 // (!) we do not want to call VisitDecl(), the declaration for translation 251 // unit is built by finalize(). 252 return true; 253 } 254 255 bool WalkUpFromCompoundStmt(CompoundStmt *S) { 256 using NodeRole = syntax::NodeRole; 257 258 Builder.markChildToken(S->getLBracLoc(), tok::l_brace, NodeRole::OpenParen); 259 for (auto *Child : S->body()) 260 Builder.markStmtChild(Child, NodeRole::CompoundStatement_statement); 261 Builder.markChildToken(S->getRBracLoc(), tok::r_brace, 262 NodeRole::CloseParen); 263 264 Builder.foldNode(Builder.getStmtRange(S), 265 new (allocator()) syntax::CompoundStatement); 266 return true; 267 } 268 269 // Some statements are not yet handled by syntax trees. 270 bool WalkUpFromStmt(Stmt *S) { 271 Builder.foldNode(Builder.getStmtRange(S), 272 new (allocator()) syntax::UnknownStatement); 273 return true; 274 } 275 276 bool TraverseCXXForRangeStmt(CXXForRangeStmt *S) { 277 // We override to traverse range initializer as VarDecl. 278 // RAV traverses it as a statement, we produce invalid node kinds in that 279 // case. 280 // FIXME: should do this in RAV instead? 281 if (S->getInit() && !TraverseStmt(S->getInit())) 282 return false; 283 if (S->getLoopVariable() && !TraverseDecl(S->getLoopVariable())) 284 return false; 285 if (S->getRangeInit() && !TraverseStmt(S->getRangeInit())) 286 return false; 287 if (S->getBody() && !TraverseStmt(S->getBody())) 288 return false; 289 return true; 290 } 291 292 bool TraverseStmt(Stmt *S) { 293 if (auto *E = llvm::dyn_cast_or_null<Expr>(S)) { 294 // (!) do not recurse into subexpressions. 295 // we do not have syntax trees for expressions yet, so we only want to see 296 // the first top-level expression. 297 return WalkUpFromExpr(E->IgnoreImplicit()); 298 } 299 return RecursiveASTVisitor::TraverseStmt(S); 300 } 301 302 // Some expressions are not yet handled by syntax trees. 303 bool WalkUpFromExpr(Expr *E) { 304 assert(!isImplicitExpr(E) && "should be handled by TraverseStmt"); 305 Builder.foldNode(Builder.getExprRange(E), 306 new (allocator()) syntax::UnknownExpression); 307 return true; 308 } 309 310 // The code below is very regular, it could even be generated with some 311 // preprocessor magic. We merely assign roles to the corresponding children 312 // and fold resulting nodes. 313 bool WalkUpFromDeclStmt(DeclStmt *S) { 314 Builder.foldNode(Builder.getStmtRange(S), 315 new (allocator()) syntax::DeclarationStatement); 316 return true; 317 } 318 319 bool WalkUpFromNullStmt(NullStmt *S) { 320 Builder.foldNode(Builder.getStmtRange(S), 321 new (allocator()) syntax::EmptyStatement); 322 return true; 323 } 324 325 bool WalkUpFromSwitchStmt(SwitchStmt *S) { 326 Builder.markChildToken(S->getSwitchLoc(), tok::kw_switch, 327 syntax::NodeRole::IntroducerKeyword); 328 Builder.markStmtChild(S->getBody(), syntax::NodeRole::BodyStatement); 329 Builder.foldNode(Builder.getStmtRange(S), 330 new (allocator()) syntax::SwitchStatement); 331 return true; 332 } 333 334 bool WalkUpFromCaseStmt(CaseStmt *S) { 335 Builder.markChildToken(S->getKeywordLoc(), tok::kw_case, 336 syntax::NodeRole::IntroducerKeyword); 337 Builder.markExprChild(S->getLHS(), syntax::NodeRole::CaseStatement_value); 338 Builder.markStmtChild(S->getSubStmt(), syntax::NodeRole::BodyStatement); 339 Builder.foldNode(Builder.getStmtRange(S), 340 new (allocator()) syntax::CaseStatement); 341 return true; 342 } 343 344 bool WalkUpFromDefaultStmt(DefaultStmt *S) { 345 Builder.markChildToken(S->getKeywordLoc(), tok::kw_default, 346 syntax::NodeRole::IntroducerKeyword); 347 Builder.markStmtChild(S->getSubStmt(), syntax::NodeRole::BodyStatement); 348 Builder.foldNode(Builder.getStmtRange(S), 349 new (allocator()) syntax::DefaultStatement); 350 return true; 351 } 352 353 bool WalkUpFromIfStmt(IfStmt *S) { 354 Builder.markChildToken(S->getIfLoc(), tok::kw_if, 355 syntax::NodeRole::IntroducerKeyword); 356 Builder.markStmtChild(S->getThen(), 357 syntax::NodeRole::IfStatement_thenStatement); 358 Builder.markChildToken(S->getElseLoc(), tok::kw_else, 359 syntax::NodeRole::IfStatement_elseKeyword); 360 Builder.markStmtChild(S->getElse(), 361 syntax::NodeRole::IfStatement_elseStatement); 362 Builder.foldNode(Builder.getStmtRange(S), 363 new (allocator()) syntax::IfStatement); 364 return true; 365 } 366 367 bool WalkUpFromForStmt(ForStmt *S) { 368 Builder.markChildToken(S->getForLoc(), tok::kw_for, 369 syntax::NodeRole::IntroducerKeyword); 370 Builder.markStmtChild(S->getBody(), syntax::NodeRole::BodyStatement); 371 Builder.foldNode(Builder.getStmtRange(S), 372 new (allocator()) syntax::ForStatement); 373 return true; 374 } 375 376 bool WalkUpFromWhileStmt(WhileStmt *S) { 377 Builder.markChildToken(S->getWhileLoc(), tok::kw_while, 378 syntax::NodeRole::IntroducerKeyword); 379 Builder.markStmtChild(S->getBody(), syntax::NodeRole::BodyStatement); 380 Builder.foldNode(Builder.getStmtRange(S), 381 new (allocator()) syntax::WhileStatement); 382 return true; 383 } 384 385 bool WalkUpFromContinueStmt(ContinueStmt *S) { 386 Builder.markChildToken(S->getContinueLoc(), tok::kw_continue, 387 syntax::NodeRole::IntroducerKeyword); 388 Builder.foldNode(Builder.getStmtRange(S), 389 new (allocator()) syntax::ContinueStatement); 390 return true; 391 } 392 393 bool WalkUpFromBreakStmt(BreakStmt *S) { 394 Builder.markChildToken(S->getBreakLoc(), tok::kw_break, 395 syntax::NodeRole::IntroducerKeyword); 396 Builder.foldNode(Builder.getStmtRange(S), 397 new (allocator()) syntax::BreakStatement); 398 return true; 399 } 400 401 bool WalkUpFromReturnStmt(ReturnStmt *S) { 402 Builder.markChildToken(S->getReturnLoc(), tok::kw_return, 403 syntax::NodeRole::IntroducerKeyword); 404 Builder.markExprChild(S->getRetValue(), 405 syntax::NodeRole::ReturnStatement_value); 406 Builder.foldNode(Builder.getStmtRange(S), 407 new (allocator()) syntax::ReturnStatement); 408 return true; 409 } 410 411 bool WalkUpFromCXXForRangeStmt(CXXForRangeStmt *S) { 412 Builder.markChildToken(S->getForLoc(), tok::kw_for, 413 syntax::NodeRole::IntroducerKeyword); 414 Builder.markStmtChild(S->getBody(), syntax::NodeRole::BodyStatement); 415 Builder.foldNode(Builder.getStmtRange(S), 416 new (allocator()) syntax::RangeBasedForStatement); 417 return true; 418 } 419 420 private: 421 /// A small helper to save some typing. 422 llvm::BumpPtrAllocator &allocator() { return Builder.allocator(); } 423 424 syntax::TreeBuilder &Builder; 425 const LangOptions &LangOpts; 426 }; 427 } // namespace 428 429 void syntax::TreeBuilder::foldNode(llvm::ArrayRef<syntax::Token> Range, 430 syntax::Tree *New) { 431 Pending.foldChildren(Range, New); 432 } 433 434 void syntax::TreeBuilder::markChildToken(SourceLocation Loc, 435 tok::TokenKind Kind, NodeRole Role) { 436 if (Loc.isInvalid()) 437 return; 438 Pending.assignRole(*findToken(Loc), Role); 439 } 440 441 void syntax::TreeBuilder::markStmtChild(Stmt *Child, NodeRole Role) { 442 if (!Child) 443 return; 444 445 auto Range = getStmtRange(Child); 446 // This is an expression in a statement position, consume the trailing 447 // semicolon and form an 'ExpressionStatement' node. 448 if (auto *E = dyn_cast<Expr>(Child)) { 449 Pending.assignRole(getExprRange(E), 450 NodeRole::ExpressionStatement_expression); 451 // (!) 'getRange(Stmt)' ensures this already covers a trailing semicolon. 452 Pending.foldChildren(Range, new (allocator()) syntax::ExpressionStatement); 453 } 454 Pending.assignRole(Range, Role); 455 } 456 457 void syntax::TreeBuilder::markExprChild(Expr *Child, NodeRole Role) { 458 Pending.assignRole(getExprRange(Child), Role); 459 } 460 461 const syntax::Token *syntax::TreeBuilder::findToken(SourceLocation L) const { 462 auto Tokens = Arena.tokenBuffer().expandedTokens(); 463 auto &SM = Arena.sourceManager(); 464 auto It = llvm::partition_point(Tokens, [&](const syntax::Token &T) { 465 return SM.isBeforeInTranslationUnit(T.location(), L); 466 }); 467 assert(It != Tokens.end()); 468 assert(It->location() == L); 469 return &*It; 470 } 471 472 syntax::TranslationUnit * 473 syntax::buildSyntaxTree(Arena &A, const TranslationUnitDecl &TU) { 474 TreeBuilder Builder(A); 475 BuildTreeVisitor(TU.getASTContext(), Builder).TraverseAST(TU.getASTContext()); 476 return std::move(Builder).finalize(); 477 } 478