# mjbommar/list_local_symbols.py

## list_local_symbols.py
# Copyright: Licensio, LLC 2022
# License: AGPL-3.0

import argparse
import multiprocessing
import os
import subprocess
import pandas


def get_file_list(path: str, recursive: bool = False):
    """
    Yield the paths of all regular files in a directory.

    The search is driven by the ``path`` and ``recursive`` arguments only, so
    the function can be called without the command-line ``args`` namespace
    being defined.

    :param path: path to the directory to get the files from
    :param recursive: if true, recursively get all files in the directory
    :return: a generator with file paths (each joined onto ``path``)
    """
    if recursive:
        for directory_path, _, file_names in os.walk(path):
            for file_name in file_names:
                candidate = os.path.join(directory_path, file_name)
                # os.walk lists every non-directory entry; keep only those
                # that os.path.isfile accepts
                if os.path.isfile(candidate):
                    yield candidate
    else:
        for entry_name in os.listdir(path):
            candidate = os.path.join(path, entry_name)
            if os.path.isfile(candidate):
                yield candidate


def run_nm(file_path: str):
    """
    Run the nm command on a file and parse the symbols it prints.

    nm is invoked with ``-A``, so each stdout line is expected to look like
    ``path[:member]:[value] type name`` where ``member`` is only present for
    archive members.

    :param file_path: path to the file to run the command on.
    :return: a list of dict entries containing the symbol info; every entry
        has the keys ``path``, ``object`` (the member name, or None),
        ``symbol_type`` and ``symbol_name``. Lines that cannot be parsed are
        reported on stderr and skipped.
    """
    import sys  # local import: only needed for the diagnostic below

    # only stdout is parsed; nm's own diagnostics and exit status are ignored
    completed = subprocess.run(
        ["nm", "-a", "-A", "--special-syms", "--synthetic", file_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        check=False,
    )

    # paths and symbol names are not guaranteed to be valid UTF-8; replace
    # undecodable bytes instead of raising and aborting the whole pool
    stdout_text = completed.stdout.decode("utf-8", errors="replace")

    path_prefix = file_path + ":"
    symbol_data = []
    for line in stdout_text.splitlines():
        # strip the known file name first so a ":" inside the path does not
        # confuse the field split
        if line.startswith(path_prefix):
            path, remainder = file_path, line[len(path_prefix):]
        else:
            path, _, remainder = line.partition(":")

        # whatever precedes the last ":" of the remainder is the member name
        member, separator, fields = remainder.rpartition(":")
        tokens = fields.split()

        if len(tokens) < 2:
            print(
                f"warning: skipping unparseable nm output line: {line!r}",
                file=sys.stderr,
            )
            continue

        if len(tokens) == 2:
            # no value column: "type name"
            symbol_type, symbol_name = tokens
        else:
            # "value type name"
            symbol_type, symbol_name = tokens[1], tokens[2]

        symbol_data.append(
            {
                "path": path,
                "object": member if separator else None,
                "symbol_type": symbol_type,
                "symbol_name": symbol_name,
            }
        )

    return symbol_data


if __name__ == "__main__":
    # Command-line entry point: collect files, run nm on each in parallel,
    # write the combined symbol table to disk and print a short summary.

    # parse CLI arguments
    parser = argparse.ArgumentParser(description="List local symbols in binaries")
    # nargs="?" makes the positional optional so that the "." default applies
    parser.add_argument(
        "path", help="Path to begin search", type=str, nargs="?", default="."
    )
    parser.add_argument(
        "--recursive",
        help="Whether to search recursively",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--output-path",
        help="Path to output file (default: symbols.csv or symbols.json)",
        type=str,
    )
    # choices rejects unknown formats up front instead of writing nothing
    parser.add_argument(
        "--output-format",
        help="Output format: {csv, json}",
        type=str,
        choices=("csv", "json"),
        default="csv",
    )

    # kept as a module-level name because other code in this file reads it
    args = parser.parse_args()

    # setup and execute a pool of nm processes; the context manager shuts the
    # workers down once map() has returned every result
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        pool_results = pool.map(run_nm, get_file_list(args.path, args.recursive))

    # reduce map results into single list
    symbol_data = []
    for file_result in pool_results:
        symbol_data.extend(file_result)

    # store results
    symbol_df = pandas.DataFrame(symbol_data)

    output_path = args.output_path or f"symbols.{args.output_format}"
    if args.output_format == "csv":
        symbol_df.to_csv(output_path, encoding="utf-8")
    else:
        symbol_df.to_json(output_path)

    # print summary; head() keeps the first 5 rows of each count table
    print(f"identified {symbol_df.shape[0]} symbols")
    if symbol_df.shape[0] > 0:
        print("top 5 files by symbol count:")
        print(symbol_df["path"].value_counts().head())
        print("top 5 symbols by count:")
        print(symbol_df["symbol_name"].value_counts().head())
	# Copyright: Licensio, LLC 2022
	# License: AGPL-3.0

	import argparse
	import multiprocessing
	import os
	import subprocess
	import pandas


	def get_file_list(path: str, recursive: bool = False):
	    """
	    Yield the paths of all regular files in a directory.

	    The search is driven by the ``path`` and ``recursive`` arguments only, so
	    the function can be called without the command-line ``args`` namespace
	    being defined.

	    :param path: path to the directory to get the files from
	    :param recursive: if true, recursively get all files in the directory
	    :return: a generator with file paths (each joined onto ``path``)
	    """
	    if recursive:
	        for directory_path, _, file_names in os.walk(path):
	            for file_name in file_names:
	                candidate = os.path.join(directory_path, file_name)
	                # os.walk lists every non-directory entry; keep only those
	                # that os.path.isfile accepts
	                if os.path.isfile(candidate):
	                    yield candidate
	    else:
	        for entry_name in os.listdir(path):
	            candidate = os.path.join(path, entry_name)
	            if os.path.isfile(candidate):
	                yield candidate


	def run_nm(file_path: str):
	    """
	    Run the nm command on a file and parse the symbols it prints.

	    nm is invoked with ``-A``, so each stdout line is expected to look like
	    ``path[:member]:[value] type name`` where ``member`` is only present for
	    archive members.

	    :param file_path: path to the file to run the command on.
	    :return: a list of dict entries containing the symbol info; every entry
	        has the keys ``path``, ``object`` (the member name, or None),
	        ``symbol_type`` and ``symbol_name``. Lines that cannot be parsed are
	        reported on stderr and skipped.
	    """
	    import sys  # local import: only needed for the diagnostic below

	    # only stdout is parsed; nm's own diagnostics and exit status are ignored
	    completed = subprocess.run(
	        ["nm", "-a", "-A", "--special-syms", "--synthetic", file_path],
	        stdout=subprocess.PIPE,
	        stderr=subprocess.DEVNULL,
	        check=False,
	    )

	    # paths and symbol names are not guaranteed to be valid UTF-8; replace
	    # undecodable bytes instead of raising and aborting the whole pool
	    stdout_text = completed.stdout.decode("utf-8", errors="replace")

	    path_prefix = file_path + ":"
	    symbol_data = []
	    for line in stdout_text.splitlines():
	        # strip the known file name first so a ":" inside the path does not
	        # confuse the field split
	        if line.startswith(path_prefix):
	            path, remainder = file_path, line[len(path_prefix):]
	        else:
	            path, _, remainder = line.partition(":")

	        # whatever precedes the last ":" of the remainder is the member name
	        member, separator, fields = remainder.rpartition(":")
	        tokens = fields.split()

	        if len(tokens) < 2:
	            print(
	                f"warning: skipping unparseable nm output line: {line!r}",
	                file=sys.stderr,
	            )
	            continue

	        if len(tokens) == 2:
	            # no value column: "type name"
	            symbol_type, symbol_name = tokens
	        else:
	            # "value type name"
	            symbol_type, symbol_name = tokens[1], tokens[2]

	        symbol_data.append(
	            {
	                "path": path,
	                "object": member if separator else None,
	                "symbol_type": symbol_type,
	                "symbol_name": symbol_name,
	            }
	        )

	    return symbol_data


	if __name__ == "__main__":
	    # Command-line entry point: collect files, run nm on each in parallel,
	    # write the combined symbol table to disk and print a short summary.

	    # parse CLI arguments
	    parser = argparse.ArgumentParser(description="List local symbols in binaries")
	    # nargs="?" makes the positional optional so that the "." default applies
	    parser.add_argument(
	        "path", help="Path to begin search", type=str, nargs="?", default="."
	    )
	    parser.add_argument(
	        "--recursive",
	        help="Whether to search recursively",
	        action="store_true",
	        default=False,
	    )
	    parser.add_argument(
	        "--output-path",
	        help="Path to output file (default: symbols.csv or symbols.json)",
	        type=str,
	    )
	    # choices rejects unknown formats up front instead of writing nothing
	    parser.add_argument(
	        "--output-format",
	        help="Output format: {csv, json}",
	        type=str,
	        choices=("csv", "json"),
	        default="csv",
	    )

	    # kept as a module-level name because other code in this file reads it
	    args = parser.parse_args()

	    # setup and execute a pool of nm processes; the context manager shuts the
	    # workers down once map() has returned every result
	    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
	        pool_results = pool.map(run_nm, get_file_list(args.path, args.recursive))

	    # reduce map results into single list
	    symbol_data = []
	    for file_result in pool_results:
	        symbol_data.extend(file_result)

	    # store results
	    symbol_df = pandas.DataFrame(symbol_data)

	    output_path = args.output_path or f"symbols.{args.output_format}"
	    if args.output_format == "csv":
	        symbol_df.to_csv(output_path, encoding="utf-8")
	    else:
	        symbol_df.to_json(output_path)

	    # print summary; head() keeps the first 5 rows of each count table
	    print(f"identified {symbol_df.shape[0]} symbols")
	    if symbol_df.shape[0] > 0:
	        print("top 5 files by symbol count:")
	        print(symbol_df["path"].value_counts().head())
	        print("top 5 symbols by count:")
	        print(symbol_df["symbol_name"].value_counts().head())