#!/usr/bin/env python
#===- cppreference_parser.py -  ------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#

from bs4 import BeautifulSoup, NavigableString

import collections
import multiprocessing
import os
import re
import signal
import sys


class Symbol:
    """A symbol listed on a cppreference index page and the headers found for it."""

    def __init__(self, name, namespace, headers):
        # unqualified symbol name, e.g. "move"
        self.name = name
        # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global scope)
        # None for C symbols.
        self.namespace = namespace
        # a list of corresponding headers
        self.headers = headers


def _HasClass(tag, *classes):
    """Return True if `tag` has at least one of the CSS classes in `classes`."""
    return any(c in classes for c in tag.get('class', []))


def _ParseSymbolPage(symbol_page_html, symbol_name):
    """Parse symbol page and retrieve the include header defined in this page.
    The symbol page provides header for the symbol, specifically in
    "Defined in header <header>" section. An example:

    <tr class="t-dsc-header">
      <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
    </td></tr>

    Args:
      symbol_page_html: HTML text of the symbol page.
      symbol_name: unqualified name of the symbol to look for in decl rows.

    Returns a set of headers: those attached to rows that name the symbol, or
    every header named on the page if no row names the symbol.
    """
    headers = set()
    all_headers = set()

    soup = BeautifulSoup(symbol_page_html, "html.parser")
    # Rows in table are like:
    #   Defined in header <foo>      .t-dsc-header
    #   Defined in header <bar>      .t-dsc-header
    #   decl1                        .t-dcl
    #   Defined in header <baz>      .t-dsc-header
    #   decl2                        .t-dcl
    for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
        current_headers = []
        was_decl = False
        for row in table.select('tr'):
            if _HasClass(row, 't-dcl', 't-dsc'):
                was_decl = True
                # Symbols are in the first cell.
                first_cell = row.find('td')
                if first_cell is None:
                    # A decl row without any cell cannot name the symbol.
                    continue
                if symbol_name not in first_cell.stripped_strings:
                    continue
                headers.update(current_headers)
            elif _HasClass(row, 't-dsc-header'):
                # If we saw a decl since the last header, this is a new block
                # of headers for a new block of decls.
                if was_decl:
                    current_headers = []
                was_decl = False
                # There are also .t-dsc-header for "defined in namespace".
                if "Defined in header " not in row.text:
                    continue
                # The interesting header content (e.g. <cstdlib>) is wrapped
                # in <code>.
                for header_code in row.find_all("code"):
                    current_headers.append(header_code.text)
                    all_headers.add(header_code.text)
    # If the symbol was never named, consider all named headers.
    return headers or all_headers


def _ParseIndexPage(index_page_html):
    """Parse index page.
    The index page lists all std symbols and hrefs to their detailed pages
    (which contain the defined header). An example:

    <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
    <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

    Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
    `variant` is the parenthesized caption following the link (e.g. "int"), or
    None when there is no such caption.
    """
    symbols = []
    soup = BeautifulSoup(index_page_html, "html.parser")
    for symbol_href in soup.select("a[title]"):
        # Ignore annotated symbols like "acos<>() (std::complex)".
        # These tend to be overloads, and the primary is more useful.
        # This accidentally accepts begin/end despite the (iterator) caption:
        # the (since C++11) note is first. They are good symbols, so the bug
        # is unfixed.
        caption = symbol_href.next_sibling
        variant = None
        if isinstance(caption, NavigableString) and "(" in caption:
            variant = caption.text.strip(" ()")
        symbol_tt = symbol_href.find("tt")
        if symbol_tt:
            symbols.append((symbol_tt.text.rstrip("<>()"),  # strip any trailing <>()
                            symbol_href["href"], variant))
    return symbols


def _ReadSymbolPage(path, name):
    """Read the symbol page at `path` and return the headers found for `name`."""
    with open(path) as f:
        return _ParseSymbolPage(f.read(), name)


def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
    """Get all symbols listed in the index page. All symbols should be in the
    given namespace.

    Args:
      pool: multiprocessing pool used to parse the symbol pages in parallel.
      root_dir: directory containing the index page; symbol page hrefs are
        joined onto it.
      index_page_name: file name of the index page inside root_dir.
      namespace: namespace assigned to every returned Symbol.
      variants_to_accept: map from qualified symbol name to the variants that
        are accepted for that symbol.

    Returns a list of Symbols, sorted by name.
    """

    # Workflow steps:
    #   1. Parse index page which lists all symbols to get symbol
    #      name (unqualified name) and its href link to the symbol page which
    #      contains the defined header.
    #   2. Parse the symbol page to get the defined header.
    index_page_path = os.path.join(root_dir, index_page_name)
    with open(index_page_path, "r") as f:
        index_entries = _ParseIndexPage(f.read())

    # Read each symbol page in parallel.
    results = []  # (symbol_name, promise of [header...])
    for symbol_name, symbol_page_path, variant in index_entries:
        # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
        # FIXME: use these as a fallback rather than ignoring entirely.
        variants_for_symbol = variants_to_accept.get(
            (namespace or "") + symbol_name, ())
        if variant and variant not in variants_for_symbol:
            continue
        path = os.path.join(root_dir, symbol_page_path)
        results.append((symbol_name,
                        pool.apply_async(_ReadSymbolPage, (path, symbol_name))))

    # Build map from symbol name to a set of headers.
    symbol_headers = collections.defaultdict(set)
    for symbol_name, lazy_headers in results:
        symbol_headers[symbol_name].update(lazy_headers.get())

    # Sort the headers too, so the output does not depend on set ordering.
    return [Symbol(name, namespace, sorted(headers))
            for name, headers in sorted(symbol_headers.items(),
                                        key=lambda t: t[0])]


def _IgnoreSigint():
    """Pool initializer: make the worker process ignore SIGINT (Ctrl-C).

    This is a module-level function rather than a lambda so that it can be
    pickled when the pool's start method has to send it to the workers.
    """
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def GetSymbols(parse_pages):
    """Get all symbols by parsing the given pages.

    Args:
      parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

    Returns a list of Symbols, concatenated in the order of parse_pages.
    """
    # By default we prefer the non-variant versions, as they're more common. But
    # there are some symbols, whose variant is more common. This list describes
    # those symbols.
    variants_to_accept = {
        # std::remove<> has variant algorithm.
        # NOTE: the trailing comma matters; without it the value is a plain
        # string and the membership test becomes a substring match.
        "std::remove": ("algorithm",),
    }
    symbols = []
    # Run many workers to process individual symbol pages under the symbol index.
    # Don't allow workers to capture Ctrl-C.
    pool = multiprocessing.Pool(initializer=_IgnoreSigint)
    try:
        for root_dir, page_name, namespace in parse_pages:
            symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace,
                                       variants_to_accept))
    finally:
        pool.terminate()
        pool.join()
    return symbols