Last active
January 10, 2021 13:56
-
-
Save bigsnarfdude/5e272447772a1949258fbf877c469f88 to your computer and use it in GitHub Desktop.
process_asm_files.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import glob | |
import cPickle as pickle | |
from multiprocessing import Process | |
import config as CFG | |
# Number of worker processes used for the section-statistics pass.
number_of_jobs = CFG.number_of_jobs

# The data set to process is selected by the first command-line argument.
# A missing argument is treated like an unknown one instead of raising
# IndexError.
run_mode = sys.argv[1] if len(sys.argv) > 1 else None
if run_mode == 'train':
    input_directory = CFG.training_data_directory_path
elif run_mode == 'test':
    input_directory = CFG.testing_data_directory_path
else:
    # sys.exit() with a string writes it to stderr and exits with status 1,
    # so calling scripts can detect the failure.
    sys.exit('Unknown option: expected "train" or "test" as first argument')
def extract_filenames(input_directory):
    """Return the base names of all ``.asm`` files in ``input_directory``.

    Args:
        input_directory: Directory prefix that is concatenated directly with
            ``'*.asm'``, so it must already end with a path separator.

    Returns:
        A list of file names with the directory part and the trailing
        extension removed, in the order reported by ``glob``.
    """
    asm_paths = glob.glob(input_directory + '*.asm')
    # A real list (not a lazy ``map`` object) is returned because the result
    # is iterated more than once by the code that consumes it.  Only the
    # final extension is stripped, so a file such as ``a.b.asm`` yields
    # ``a.b`` and can be found again as ``<name>.asm``.
    return [os.path.splitext(os.path.basename(asm_path))[0]
            for asm_path in asm_paths]
def create_output_dir(path, folder):
    """Make sure the directory ``path + folder`` exists and return its path.

    Args:
        path: Parent directory prefix; joined to ``folder`` by plain string
            concatenation, so it must end with a path separator.
        folder: Name of the directory to create below ``path``.

    Returns:
        The concatenated string ``path + folder``.

    Raises:
        OSError: If the directory cannot be created and does not already
            exist as a directory (e.g. permission denied, or a regular file
            occupies the path).
    """
    dir_string = path + folder
    try:
        os.makedirs(dir_string)
    except OSError:
        # Attempt-then-verify avoids the race between an existence check and
        # the creation: a directory that is already there (possibly created
        # by another process in the meantime) is fine, anything else is a
        # genuine error and is re-raised.
        if not os.path.isdir(dir_string):
            raise
    return dir_string
# Destination for the pickled per-file section histograms.
output_directory_sections = create_output_dir(
    path=CFG.generated_features_directory_path,
    folder='sections_hist/',
)
# Destination for the line-count and opcode-count files.
output_directory_asm = create_output_dir(
    path=CFG.generated_features_directory_path,
    folder='spectral_asm/',
)
# Base names of every ASM file found in the selected input directory.
file_names = extract_filenames(input_directory=input_directory)
def worker_extract_stats(fname, input_directory, output_directory_sections):
    """Build a line histogram for one ASM file and pickle it.

    Every line is keyed by the lower-cased text before its first ``':'``
    (the whole line, including its newline, when it has no colon).  The
    extra key ``'sum'`` holds the total number of lines read.

    Args:
        fname: Base name of the file; ``input_directory + fname + '.asm'``
            is read and ``output_directory_sections + fname`` is written.
        input_directory: Directory prefix of the input, ending with a path
            separator.
        output_directory_sections: Directory prefix of the output, ending
            with a path separator.
    """
    stat = {}
    # ``with`` guarantees the handles are closed even when a worker
    # processes many files in a row.
    with open(input_directory + fname + '.asm', 'r') as fin:
        for line in fin:
            line_type = line.split(':')[0].lower()
            stat[line_type] = stat.get(line_type, 0) + 1
            stat['sum'] = stat.get('sum', 0) + 1
    # Pickle data is binary, so the target must be opened in 'wb' mode.
    with open(output_directory_sections + fname, 'wb') as fout:
        pickle.dump(stat, fout)
def wrapper_extract_stats(fname_list, input_directory, output_directory_sections):
    """Run ``worker_extract_stats`` for each name in ``fname_list``, in order.

    Args:
        fname_list: Iterable of ASM base names handled by this call.
        input_directory: Passed through unchanged to the worker.
        output_directory_sections: Passed through unchanged to the worker.
    """
    for asm_name in fname_list:
        worker_extract_stats(
            asm_name,
            input_directory,
            output_directory_sections,
        )
def worker_extract_op_stats(fname, input_directory, output_directory_asm):
    """Append the line count and per-opcode line counts of one ASM file.

    One row is appended to each of the following files below
    ``output_directory_asm``:

    * ``fnames``     - the value of ``fname``
    * ``line_count`` - number of newline characters in the input file
    * ``<opcode>``   - number of lines in which that opcode occurs with a
      whitespace character directly before and after it

    Because every call appends exactly one row to every file, row *k* of
    all outputs describes the same input as long as calls are not
    interleaved.

    Args:
        fname: Base name of the file; ``input_directory + fname + '.asm'``
            is read.
        input_directory: Directory prefix of the input, ending with a path
            separator.
        output_directory_asm: Directory prefix of the output files, ending
            with a path separator.
    """
    import re  # function-scope import: ``re`` is not imported at file level

    # Each mnemonic appears exactly once; a repeated entry would append two
    # rows per input to its output file and break the row alignment.
    opcodes = ('jmp', 'mov', 'retf', 'push', 'pop', 'xor',
               'retn', 'nop', 'sub', 'inc', 'dec', 'add',
               'imul', 'xchg', 'or', 'shr', 'cmp', 'call',
               'shl', 'ror', 'rol', 'jnb')
    op_tokens = [op.encode('ascii') for op in opcodes]

    # The zero-width lookahead lets neighbouring hits share one whitespace
    # byte (" inc dec " counts for both 'inc' and 'dec').
    finder = re.compile(b'(?=\\s(' + b'|'.join(op_tokens) + b')\\s)')

    hits_per_op = dict.fromkeys(op_tokens, 0)
    line_count = 0
    # Bytes mode: no decoding step that could fail on unusual characters,
    # and '\r' is kept so that it still counts as whitespace after an opcode.
    with open(input_directory + fname + '.asm', 'rb') as asm_file:
        for raw_line in asm_file:
            if raw_line.endswith(b'\n'):
                # Only terminated lines are counted (``wc -l`` semantics);
                # the terminator itself must not satisfy the trailing \s.
                line_count += 1
                raw_line = raw_line[:-1]
            # set(): a line counts once per opcode, however often it occurs.
            for token in set(finder.findall(raw_line)):
                hits_per_op[token] += 1

    rows = [('fnames', fname), ('line_count', line_count)]
    rows.extend((op, hits_per_op[token])
                for op, token in zip(opcodes, op_tokens))
    for target, value in rows:
        with open(output_directory_asm + target, 'a') as sink:
            sink.write('{}\n'.format(value))
# The guard keeps child processes that re-import this module (the "spawn"
# start method of multiprocessing) from launching workers of their own.
if __name__ == '__main__':
    # Materialise once so that slicing and the second pass below both work
    # even when file_names is a one-shot iterable.
    all_names = list(file_names)

    # extraction of ASM file statistics of section counts
    # Files are dealt out round-robin: worker k gets items k, k + jobs, ...
    workers_extract_stats = []
    for worker_id in range(number_of_jobs):
        p = Process(target=wrapper_extract_stats,
                    args=(all_names[worker_id::number_of_jobs],
                          input_directory,
                          output_directory_sections))
        workers_extract_stats.append(p)
        p.start()
    for p in workers_extract_stats:
        p.join()

    # extraction of ASM file op statistics
    # Runs serially: worker_extract_op_stats appends to output files shared
    # by all inputs, so the call order defines the row order.
    for i, fname in enumerate(all_names):
        worker_extract_op_stats(fname, input_directory, output_directory_asm)
        if i % 200 == 0:
            print(i)  # coarse progress indicator
    print('[[[ ASM stats and ops counts completed ]]]')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment