Skip to content

Instantly share code, notes, and snippets.

@mjbommar
Created April 18, 2022 19:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mjbommar/b73d996fa83f5f9f781cef88b1e33fb2 to your computer and use it in GitHub Desktop.
Save mjbommar/b73d996fa83f5f9f781cef88b1e33fb2 to your computer and use it in GitHub Desktop.
# Copyright: Licensio, LLC 2022
# License: AGPL-3.0
import argparse
import multiprocessing
import os
import subprocess
import pandas
def get_file_list(path: str, recursive: bool = False):
    """
    Yield the paths of all regular files in a directory.

    :param path: path to the directory to get the files from
    :param recursive: if true, descend into sub-directories as well
    :return: a generator with file paths (each joined onto ``path``)
    """
    if recursive:
        for directory_path, _, file_list in os.walk(path):
            for file_name in file_list:
                file_path = os.path.join(directory_path, file_name)
                # os.walk also lists entries such as broken symlinks under
                # "files"; keep only what resolves to a regular file.
                if os.path.isfile(file_path):
                    yield file_path
    else:
        for entry_name in os.listdir(path):
            file_path = os.path.join(path, entry_name)
            if os.path.isfile(file_path):
                yield file_path
def _parse_nm_line(line: str, file_path: str):
    """
    Parse one line of ``nm -A`` output into a symbol record.

    :param line: a single output line, ``path:[member:][value] type name``
    :param file_path: the path that was handed to nm for this run
    :return: a dict with path/object/symbol_type/symbol_name, or None when
        the line does not contain a symbol
    """
    prefix = file_path + ":"
    if line.startswith(prefix):
        # Strip the known path rather than splitting on ":" so that colons
        # inside the path or inside the symbol name cannot shift the fields.
        path = file_path
        remainder = line[len(prefix):]
        member, separator, tail = remainder.partition(":")
        # The symbol fields are whitespace separated, so whitespace in front
        # of the colon means the colon belongs to the symbol name, not to an
        # archive member prefix.
        if separator and member and not any(ch.isspace() for ch in member):
            object_name, symbol_text = member, tail
        else:
            object_name, symbol_text = None, remainder
    else:
        # Unexpected prefix: fall back to a plain colon split.
        parts = line.split(":")
        if len(parts) == 2:
            path, object_name, symbol_text = parts[0], None, parts[1]
        elif len(parts) == 3:
            path, object_name, symbol_text = parts
        else:
            return None

    tokens = symbol_text.split()
    if len(tokens) < 2:
        # Not enough fields for a type and a name.
        return None
    if len(tokens) == 2:
        # No value column: "type name".
        symbol_type, symbol_name = tokens
    else:
        # "value type name"
        symbol_type, symbol_name = tokens[1], tokens[2]
    return {
        "path": path,
        "object": object_name,
        "symbol_type": symbol_type,
        "symbol_name": symbol_name,
    }


def run_nm(file_path: str):
    """
    Run the nm command on a file.

    :param file_path: path to the file to run the command on.
    :return: a list of dict entries containing the symbol info; empty when
        nm prints nothing on stdout for the file
    """
    # run the command; whatever nm writes to stderr is discarded
    p = subprocess.Popen(
        ["nm", "-a", "-A", "--special-syms", "--synthetic", file_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    stdout_buffer, _ = p.communicate()
    # Undecodable bytes are replaced instead of raising, so one odd file
    # cannot abort the whole run.
    stdout_text = stdout_buffer.decode("utf-8", errors="replace")

    symbol_data = []
    for line in stdout_text.splitlines():
        if not line.strip():
            continue
        entry = _parse_nm_line(line, file_path)
        if entry is None:
            print(f"warning: skipping unparsable nm output line: {line!r}")
        else:
            symbol_data.append(entry)
    return symbol_data
if __name__ == "__main__":
    # parse CLI arguments
    parser = argparse.ArgumentParser(description="List local symbols in binaries")
    # nargs="?" makes the positional optional; without it argparse always
    # requires a value and the "." default can never apply.
    parser.add_argument(
        "path", help="Path to begin search", type=str, nargs="?", default="."
    )
    parser.add_argument(
        "--recursive",
        help="Whether to search recursively",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--output-path",
        help="Path to output file (default: symbols.<output-format>)",
        type=str,
    )
    # choices= rejects unknown formats up front instead of finishing the
    # whole scan and then writing nothing.
    parser.add_argument(
        "--output-format",
        help="Output format: {csv, json}",
        type=str,
        choices=("csv", "json"),
        default="csv",
    )
    args = parser.parse_args()

    # setup and execute a pool of nm processes; the context manager shuts
    # the workers down once the map has returned
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        pool_results = pool.map(run_nm, get_file_list(args.path, args.recursive))

    # reduce map results into single list
    symbol_data = [entry for file_result in pool_results for entry in file_result]

    # store results
    symbol_df = pandas.DataFrame(symbol_data)
    output_path = args.output_path or f"symbols.{args.output_format}"
    if args.output_format == "csv":
        symbol_df.to_csv(output_path, encoding="utf-8")
    else:
        symbol_df.to_json(output_path)

    # print summary; Series.head() shows the first five rows
    print(f"identified {symbol_df.shape[0]} symbols")
    if symbol_df.shape[0] > 0:
        print("top 5 files by symbol count:")
        print(symbol_df["path"].value_counts().head())
        print("top 5 symbols by count:")
        print(symbol_df["symbol_name"].value_counts().head())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment