1*46a6f5aeSKirill Bobyrev#!/usr/bin/env python
2*46a6f5aeSKirill Bobyrev#===- cppreference_parser.py -  ------------------------------*- python -*--===#
3*46a6f5aeSKirill Bobyrev#
4*46a6f5aeSKirill Bobyrev# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5*46a6f5aeSKirill Bobyrev# See https://llvm.org/LICENSE.txt for license information.
6*46a6f5aeSKirill Bobyrev# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7*46a6f5aeSKirill Bobyrev#
8*46a6f5aeSKirill Bobyrev#===------------------------------------------------------------------------===#
9*46a6f5aeSKirill Bobyrev
10*46a6f5aeSKirill Bobyrevfrom bs4 import BeautifulSoup, NavigableString
11*46a6f5aeSKirill Bobyrev
12*46a6f5aeSKirill Bobyrevimport collections
13*46a6f5aeSKirill Bobyrevimport multiprocessing
14*46a6f5aeSKirill Bobyrevimport os
15*46a6f5aeSKirill Bobyrevimport re
16*46a6f5aeSKirill Bobyrevimport signal
17*46a6f5aeSKirill Bobyrevimport sys
18*46a6f5aeSKirill Bobyrev
19*46a6f5aeSKirill Bobyrev
class Symbol:
  """A library symbol together with the headers associated with it."""

  def __init__(self, name, namespace, headers):
    # name:      unqualified symbol name, e.g. "move".
    # namespace: namespace of the symbol including the trailing "::",
    #            e.g. "std::"; "" is the global scope, None marks a C symbol.
    # headers:   list of the corresponding headers.
    self.name, self.namespace, self.headers = name, namespace, headers
30*46a6f5aeSKirill Bobyrev
31*46a6f5aeSKirill Bobyrev
def _HasClass(tag, *classes):
  """Return True if `tag` carries at least one of the given CSS classes.

  A tag without a 'class' attribute matches nothing.
  """
  return any(css_class in classes for css_class in tag.get('class', []))
37*46a6f5aeSKirill Bobyrev
38*46a6f5aeSKirill Bobyrev
def _ParseSymbolPage(symbol_page_html, symbol_name):
  """Parse symbol page and retrieve the include headers defined in this page.

  The symbol page provides header for the symbol, specifically in
  "Defined in header <header>" section. An example:

  <tr class="t-dsc-header">
    <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
  </td></tr>

  Args:
    symbol_page_html: HTML text of the symbol page.
    symbol_name: unqualified name of the symbol to look for in the tables.

  Returns a set of header strings (the text of the <code> elements). These
  are the headers preceding the declarations that name the symbol; if no
  declaration names the symbol, every header found on the page is returned.
  """
  headers = set()
  all_headers = set()

  soup = BeautifulSoup(symbol_page_html, "html.parser")
  # Rows in table are like:
  #   Defined in header <foo>      .t-dsc-header
  #   Defined in header <bar>      .t-dsc-header
  #   decl1                        .t-dcl
  #   Defined in header <baz>      .t-dsc-header
  #   decl2                        .t-dcl
  for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
    current_headers = []
    was_decl = False
    for row in table.select('tr'):
      if _HasClass(row, 't-dcl', 't-dsc'):
        was_decl = True
        # Symbols are in the first cell. A declaration row without any cell
        # cannot name the symbol, so skip it instead of failing on None.
        first_cell = row.find('td')
        if first_cell is None:
          continue
        if symbol_name not in first_cell.stripped_strings:
          continue
        headers.update(current_headers)
      elif _HasClass(row, 't-dsc-header'):
        # If we saw a decl since the last header, this is a new block of headers
        # for a new block of decls.
        if was_decl:
          current_headers = []
        was_decl = False
        # There are also .t-dsc-header for "defined in namespace".
        if "Defined in header " not in row.text:
          continue
        # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
        for header_code in row.find_all("code"):
          header = header_code.text
          current_headers.append(header)
          all_headers.add(header)
  # If the symbol was never named, consider all named headers.
  return headers or all_headers
86*46a6f5aeSKirill Bobyrev
87*46a6f5aeSKirill Bobyrev
def _ParseIndexPage(index_page_html):
  """Parse index page.

  The index page lists all std symbols and hrefs to their detailed pages
  (which contain the defined header). An example:

  <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
  <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

  Returns a list of tuples (symbol_name, relative_path_to_symbol_page,
  variant). variant is the caption following the link with surrounding
  spaces and parentheses stripped (e.g. "int" above), or None if the link is
  not followed by a text node containing "(".
  """
  symbols = []
  soup = BeautifulSoup(index_page_html, "html.parser")
  for symbol_href in soup.select("a[title]"):
    # Record the annotation of symbols like "acos<>() (std::complex)" as the
    # variant. These tend to be overloads, and the primary is usually more
    # useful, so callers can use the variant to filter them out.
    # This accidentally accepts begin/end despite the (iterator) caption: the
    # (since C++11) note is first. They are good symbols, so the bug is unfixed.
    caption = symbol_href.next_sibling
    variant = None
    if isinstance(caption, NavigableString) and "(" in caption:
      # NavigableString is a str subclass, so convert it directly rather than
      # going through `.text`, which older Beautiful Soup releases do not
      # provide on NavigableString.
      variant = str(caption).strip(" ()")
    symbol_tt = symbol_href.find("tt")
    if symbol_tt:
      symbols.append((symbol_tt.text.rstrip("<>()"), # strip any trailing <>()
                      symbol_href["href"], variant))
  return symbols
114*46a6f5aeSKirill Bobyrev
115*46a6f5aeSKirill Bobyrev
def _ReadSymbolPage(path, name):
  """Read the symbol page at `path` and return the headers for symbol `name`.

  Returns whatever _ParseSymbolPage returns for the page contents.
  """
  # Decode explicitly as UTF-8 (the pages are expected to be UTF-8) instead of
  # relying on the platform's locale-dependent default encoding.
  with open(path, encoding="utf-8") as f:
    return _ParseSymbolPage(f.read(), name)
119*46a6f5aeSKirill Bobyrev
120*46a6f5aeSKirill Bobyrev
def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
  """Get all symbols listed in the index page. All symbols should be in the
  given namespace.

  Args:
    pool: worker pool providing apply_async, used to parse symbol pages.
    root_dir: directory containing the index page; symbol page hrefs are
      resolved relative to it.
    index_page_name: file name of the index page inside root_dir.
    namespace: namespace stored on every returned Symbol (may be None).
    variants_to_accept: dict from qualified symbol name to the collection of
      variants that are accepted for that symbol.

  Returns a list of Symbols, sorted by name, each with a sorted header list.
  """

  # Workflow steps:
  #   1. Parse index page which lists all symbols to get symbol
  #      name (unqualified name) and its href link to the symbol page which
  #      contains the defined header.
  #   2. Parse the symbol page to get the defined header.
  index_page_path = os.path.join(root_dir, index_page_name)
  # Decode explicitly as UTF-8 rather than with the locale default, and close
  # the file before waiting on the workers.
  with open(index_page_path, "r", encoding="utf-8") as f:
    index_page_html = f.read()

  # Read each symbol page in parallel.
  results = [] # (symbol_name, promise of [header...])
  for symbol_name, symbol_page_path, variant in _ParseIndexPage(
      index_page_html):
    # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
    # FIXME: use these as a fallback rather than ignoring entirely.
    variants_for_symbol = variants_to_accept.get(
        (namespace or "") + symbol_name, ())
    if variant and variant not in variants_for_symbol:
      continue
    path = os.path.join(root_dir, symbol_page_path)
    results.append((symbol_name,
                    pool.apply_async(_ReadSymbolPage, (path, symbol_name))))

  # Build map from symbol name to a set of headers.
  symbol_headers = collections.defaultdict(set)
  for symbol_name, lazy_headers in results:
    symbol_headers[symbol_name].update(lazy_headers.get())

  # Sort both the symbols and each header list so the output is deterministic
  # (set iteration order for strings varies between runs).
  return [Symbol(name, namespace, sorted(symbol_headers[name]))
          for name in sorted(symbol_headers)]
157*46a6f5aeSKirill Bobyrev
158*46a6f5aeSKirill Bobyrev
def _IgnoreSigint():
  """Pool worker initializer: make the worker ignore Ctrl-C (SIGINT)."""
  signal.signal(signal.SIGINT, signal.SIG_IGN)


def GetSymbols(parse_pages):
  """Get all symbols by parsing the given pages.

  Args:
    parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

  Returns a list of Symbols, concatenated in the order of parse_pages.
  """
  # By default we prefer the non-variant versions, as they're more common. But
  # there are some symbols, whose variant is more common. This list describes
  # those symbols.
  variants_to_accept = {
      # std::remove<> has variant algorithm.
      # Note the trailing comma: without it the value is a plain string and
      # the membership test on it degrades to a substring match.
      "std::remove": ("algorithm",),
  }
  symbols = []
  # Run many workers to process individual symbol pages under the symbol index.
  # Don't allow workers to capture Ctrl-C. The initializer is a module-level
  # function rather than a lambda so that it can be pickled when worker
  # processes are spawned instead of forked.
  pool = multiprocessing.Pool(initializer=_IgnoreSigint)
  try:
    for root_dir, page_name, namespace in parse_pages:
      symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace,
                                 variants_to_accept))
  finally:
    pool.terminate()
    pool.join()
  return symbols
185