Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save trietptm/b0ebd61944c75f3852e0d1529a002b76 to your computer and use it in GitHub Desktop.
Save trietptm/b0ebd61944c75f3852e0d1529a002b76 to your computer and use it in GitHub Desktop.
process_asm_files.py
import os
import sys
import glob
import cPickle as pickle
from multiprocessing import Process
import config as CFG
# Number of worker processes used for the section-statistics pass.
number_of_jobs = CFG.number_of_jobs

# The first command-line argument selects the data set: 'train' or 'test'.
# A missing argument is treated like an unknown one instead of raising
# IndexError.
_run_mode = sys.argv[1] if len(sys.argv) > 1 else None
if _run_mode == 'train':
    input_directory = CFG.training_data_directory_path
elif _run_mode == 'test':
    input_directory = CFG.testing_data_directory_path
else:
    print('Unknown option')
    # Exit with a non-zero status so calling scripts can detect the failure.
    sys.exit(1)
def extract_filenames(input_directory):
    """Return the base names of all '.asm' files in a directory.

    Args:
        input_directory: Directory prefix. It is concatenated directly with
            the '*.asm' glob pattern, so it must end with a path separator.

    Returns:
        A list with one entry per matching file: the file name with the
        directory part and the final '.asm' extension removed.
    """
    asm_paths = glob.glob(input_directory + '*.asm')
    # basename/splitext only strip the directory and the last extension, so
    # names that contain dots (e.g. 'a.b.asm' -> 'a.b') are kept intact and
    # the platform's own path separator is honoured.
    return [os.path.splitext(os.path.basename(path))[0] for path in asm_paths]
def create_output_dir(path, folder):
    """Create the directory path + folder if needed and return its path.

    Args:
        path: Prefix of the directory to create (concatenated as-is).
        folder: Suffix appended directly to path.

    Returns:
        The concatenated string path + folder.

    Raises:
        OSError: If the directory cannot be created and does not already
            exist as a directory.
    """
    dir_string = path + folder
    try:
        os.makedirs(dir_string)
    except OSError:
        # Creating first and checking afterwards avoids the race between an
        # existence test and makedirs; only a pre-existing directory is
        # an acceptable reason for the failure.
        if not os.path.isdir(dir_string):
            raise
    return dir_string
# Make sure both output folders exist below the generated-features directory.
_features_root = CFG.generated_features_directory_path
output_directory_sections = create_output_dir(_features_root, 'sections_hist/')
output_directory_asm = create_output_dir(_features_root, 'spectral_asm/')

# Base names of the ASM files found in the selected input directory.
file_names = extract_filenames(input_directory)
def worker_extract_stats(fname, input_directory, output_directory_sections):
    """Count the lines of one ASM file per line prefix and pickle the counts.

    The prefix of a line is the lower-cased text before its first ':' (the
    whole line if it contains no ':').

    Args:
        fname: Base name of the ASM file, without the '.asm' extension.
        input_directory: Prefix prepended as-is to fname + '.asm'.
        output_directory_sections: Prefix prepended as-is to fname to form
            the path of the pickle file that is written.

    The pickled object is a dict mapping each prefix to its number of lines,
    plus the key 'sum' holding the total number of lines ('sum' is absent for
    an empty file).
    """
    stat = {}
    # Context managers close both files even if reading or pickling fails.
    with open(input_directory + fname + '.asm', 'r') as fin:
        for line in fin:
            line_type = line.split(':')[0].lower()
            stat[line_type] = stat.get(line_type, 0) + 1
            stat['sum'] = stat.get('sum', 0) + 1
    # Pickle data is binary, so the output file is opened in 'wb' mode.
    with open(output_directory_sections + fname, 'wb') as fout:
        pickle.dump(stat, fout)
def wrapper_extract_stats(fname_list, input_directory, output_directory_sections):
    """Run worker_extract_stats on every file name of fname_list, in order.

    The two directory arguments are passed through unchanged to each call.
    """
    for asm_name in fname_list:
        worker_extract_stats(asm_name, input_directory, output_directory_sections)
def worker_extract_op_stats(fname, input_directory, output_directory_asm):
    """Append the line and opcode counts of one ASM file to the feature files.

    For the file input_directory + fname + '.asm', one line is appended to
    each of the following files inside output_directory_asm:

    * 'fnames': the value of fname;
    * 'line_count': the number of newline-terminated lines;
    * one file per mnemonic in opz: the number of lines in which that
      mnemonic occurs with a whitespace character directly before and after.

    The file is scanned once in Python. No shell command is built from the
    file names, and nothing is written until all counts are available.

    Args:
        fname: Base name of the ASM file, without the '.asm' extension.
        input_directory: Prefix prepended as-is to fname + '.asm'.
        output_directory_asm: Prefix prepended as-is to each output file name.
    """
    import re  # local import: `re` is not among the module-level imports

    # 'or' is listed once: a duplicate entry would append two rows to the
    # 'or' file for every input file and put it out of step with 'fnames'.
    opz = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor',
           'retn', 'nop', 'sub', 'inc', 'dec', 'add',
           'imul', 'xchg', 'or', 'shr', 'cmp', 'call',
           'shl', 'ror', 'rol', 'jnb']

    # A single alternation finds every mnemonic in one scan of a line. The
    # trailing whitespace is a lookahead so that two mnemonics separated by
    # only one whitespace character are both found.
    op_regex = re.compile(
        ('\\s(' + '|'.join(opz) + ')(?=\\s)').encode('ascii'))

    line_count = 0
    op_counts = dict.fromkeys(opz, 0)
    # Bytes mode: arbitrary byte values in the listing cannot cause decoding
    # errors.
    with open(input_directory + fname + '.asm', 'rb') as fin:
        for raw_line in fin:
            if raw_line.endswith(b'\n'):
                # Only newline-terminated lines count towards line_count; the
                # newline itself must not satisfy the trailing-whitespace test.
                line_count += 1
                raw_line = raw_line[:-1]
            # set(): a line counts once per mnemonic, however often it occurs.
            for op in set(op_regex.findall(raw_line)):
                op_counts[op.decode('ascii')] += 1

    def append_line(target, value):
        # Append one text line to the feature file named `target`.
        with open(output_directory_asm + target, 'a') as fout:
            fout.write('{}\n'.format(value))

    append_line('fnames', fname)
    append_line('line_count', line_count)
    for op in opz:
        append_line(op, op_counts[op])
# extraction of ASM file statistics of section counts
# Worker k receives the file names at positions k, k + number_of_jobs, ...
workers_extract_stats = []
for worker_id in range(number_of_jobs):
    worker_files = file_names[worker_id::number_of_jobs]
    proc = Process(
        target=wrapper_extract_stats,
        args=(worker_files, input_directory, output_directory_sections),
    )
    workers_extract_stats.append(proc)
    proc.start()

# Block until every worker process has finished.
for proc in workers_extract_stats:
    proc.join()
# extraction of ASM file op statistics
# The files are processed one after another in this process; the index of
# every 200th file is printed as a progress indicator.
for file_index, asm_name in enumerate(file_names):
    worker_extract_op_stats(asm_name, input_directory, output_directory_asm)
    if not file_index % 200:
        print(file_index)

print('[[[ ASM stats and ops counts completed ]]]')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment