1#!/usr/bin/env python3
2
3"""
This script reads the given input file, extracts all lines starting with
"# FDATA: " (or a given prefix instead of "FDATA"), parses the directives,
replaces symbol names ("#name#") with either symbol values or with offsets from
respective anchor symbols, and writes the result to the given output file.
8"""
9
10import argparse
11import subprocess
12import sys
13import re
14
# Command-line interface. The positional order is what callers rely on:
#   <input> <objfile> <output> [prefix]
parser = argparse.ArgumentParser()
parser.add_argument(
    "input",
    help="Text file containing the '# <prefix>: ' profile directive lines",
)
parser.add_argument("objfile", help="Object file to extract symbol values from")
parser.add_argument(
    "output",
    help="File to write the profile with resolved symbols to",
)
parser.add_argument("prefix", nargs="?", default="FDATA", help="Custom FDATA prefix")
parser.add_argument("--nmtool", default="nm", help="Path to nm tool")

# Parsed once here; the rest of the script reads `args` directly.
args = parser.parse_args()
23
# Regexes to extract FDATA lines from input and parse FDATA and pre-aggregated
# profile data.
# The prefix is user-supplied text, so it is escaped to make sure any regex
# metacharacters in it are matched literally.
prefix_pat = re.compile(f"^# {re.escape(args.prefix)}: (.*)")

# FDATA records:
# <is symbol?> <closest elf symbol or DSO name> <relative FROM address>
# <is symbol?> <closest elf symbol or DSO name> <relative TO address>
# <number of mispredictions> <number of branches>
# The first (unnamed) group captures the six FROM/TO fields as one string; the
# two trailing counters are named in the order listed above.
fdata_pat = re.compile(r"([01].*) (?P<mispred>\d+) (?P<exec>\d+)")

# Pre-aggregated profile:
# {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
# [<mispred_count>]
preagg_pat = re.compile(r"(?P<type>[BFf]) (?P<offsets_count>.*)")

# Replacement symbol: #symname#
replace_pat = re.compile(r"#(?P<symname>[^#]+)#")
41
# Read input and construct the list of profile expressions. Each entry is an
# (etype, fields) pair:
#   ('FDATA',  (issym1, anchor1, offset1, issym2, anchor2, offset2,
#               mispred_count, exec_count))
#   ('PREAGG', (type, offsets_and_counts))
exprs = []
with open(args.input, 'r') as f:
    # Iterate the file object directly; there is no need to load every line
    # into memory first.
    for line in f:
        prefix_match = prefix_pat.match(line)
        if not prefix_match:
            # Not a profile directive line, ignore it
            continue
        profile_line = prefix_match.group(1)

        fdata_match = fdata_pat.match(profile_line)
        if fdata_match:
            src_dst, mispred, execnt = fdata_match.groups()
            # Split on runs of spaces not preceded by a backslash
            # (negative lookbehind)
            chunks = re.split(r'(?<!\\) +', src_dst)
            # Check if the number of records separated by non-escaped
            # whitespace exactly matches the format. This is reported with
            # sys.exit instead of assert so the check is kept under `python -O`.
            if len(chunks) != 6:
                sys.exit(f"ERROR: wrong format/whitespaces must be escaped:\n{line}")
            exprs.append(('FDATA', (*chunks, mispred, execnt)))
            continue

        preagg_match = preagg_pat.match(profile_line)
        if preagg_match:
            exprs.append(('PREAGG', preagg_match.groups()))
            continue

        sys.exit("ERROR: unexpected input:\n%s" % line)
66
# Read nm output: <symbol value> <symbol type> <symbol name>
nm_result = subprocess.run([args.nmtool, '--defined-only', args.objfile],
                           text=True, capture_output=True)
if nm_result.returncode != 0:
    # Do not hide a failed nm run: without this, the failure only shows up
    # later as "symbol ... is not defined in binary" errors.
    print(f"WARNING: {args.nmtool} exited with status {nm_result.returncode}:\n"
          f"{nm_result.stderr}", file=sys.stderr)
nm_output = nm_result.stdout

# Populate symbol map: symbol name -> symbol value (kept as the string nm
# printed; later names override earlier ones)
symbols = {}
for symline in nm_output.splitlines():
    fields = symline.split(maxsplit=2)
    if len(fields) != 3:
        # Not a "<value> <type> <name>" line (e.g. blank line); skip it
        # rather than fail on tuple unpacking.
        continue
    symval, _, symname = fields
    symbols[symname] = symval
75
def evaluate_symbol(issym, anchor, offsym):
    """Resolve one "<is symbol?> <anchor> <offset>" triple of an FDATA record.

    Args:
        issym: The "<is symbol?>" field as a string; '0' selects the absolute
            form, any other value the anchor-relative form.
        anchor: The anchor field (closest ELF symbol or DSO name).
        offsym: The offset field, either a literal offset or a "#symname#"
            placeholder.

    Returns:
        The three fields joined by single spaces. If ``offsym`` does not start
        with a "#symname#" placeholder the fields are returned unchanged.
        Otherwise the offset is replaced by the symbol value exactly as stored
        in ``symbols`` (when ``issym`` is '0'), or by the hexadecimal
        difference between the symbol value and the anchor value.

    Exits the script with an error message if the placeholder symbol, or the
    anchor when it is needed, is missing from ``symbols``.
    """
    sym_match = replace_pat.match(offsym)
    if not sym_match:
        # No need to evaluate symbol value, return as is
        return f'{issym} {anchor} {offsym}'

    symname = sym_match.group('symname')
    # Explicit checks instead of assert, so they are not removed by `python -O`
    if symname not in symbols:
        sys.exit(f"ERROR: symbol {symname} is not defined in binary")

    # Evaluate to an absolute offset if issym is false
    if issym == '0':
        return f'{issym} {anchor} {symbols[symname]}'

    # Evaluate symbol against its anchor if issym is true
    if anchor not in symbols:
        sys.exit(f"ERROR: symbol {anchor} is not defined in binary")
    # Symbol values are stored as hexadecimal strings
    sym_offset = int(symbols[symname], 16) - int(symbols[anchor], 16)
    return f'{issym} {anchor} {sym_offset:x}'
92
def replace_symbol(matchobj):
    """Return the value of the symbol named by a "#symname#" match.

    Meant to be used as the replacement callback of a regex substitution.
    Expects matchobj to have a 'symname' group which contains the symbol name.

    Returns the symbol value exactly as stored in ``symbols``. Exits the
    script with an error message if the symbol is missing from ``symbols``.
    """
    symname = matchobj.group('symname')
    # Explicit check instead of assert, so it is not removed by `python -O`
    if symname not in symbols:
        sys.exit(f"ERROR: symbol {symname} is not defined in binary")
    return symbols[symname]
100
# Write the resolved profile, one expression per line. newline='\n' keeps the
# line endings as LF on every platform.
with open(args.output, 'w', newline='\n') as f:
    for etype, expr in exprs:
        if etype == 'FDATA':
            # Six FROM/TO fields followed by the two counters, which are
            # emitted in the same order they were read.
            (issym1, anchor1, offsym1,
             issym2, anchor2, offsym2,
             mispred, execnt) = expr
            print(evaluate_symbol(issym1, anchor1, offsym1),
                  evaluate_symbol(issym2, anchor2, offsym2),
                  mispred, execnt, file=f)
        elif etype == 'PREAGG':
            # Replace all symbols enclosed in ##
            print(expr[0], replace_pat.sub(replace_symbol, expr[1]), file=f)
        else:
            # sys.exit rather than the interactive-only `exit` helper
            sys.exit("ERROR: unhandled expression type:\n%s" % etype)
114