-
-
Save platomav/bfe567b0a810bd0d5294ac6e83cdde19 to your computer and use it in GitHub Desktop.
Extract ASCII and Unicode strings using Python.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
#coding=utf-8 | |
""" | |
Binary Text Extract | |
Binary ASCII/Unicode Extractor | |
Copyright (C) 2021 Plato Mavropoulos | |
Based on https://gist.github.com/williballenthin/8e3913358a7996eab9b96bd57fc59df2 by Willi Ballenthin | |
""" | |
title = 'Binary ASCII/Unicode Extractor v1.0'

# Show the tool banner before anything else runs
print('\n' + title)

import sys

# Detect Python version and stop with a readable message when it is too old
sys_ver = sys.version_info
if sys_ver[:2] < (3, 7):
    sys.stdout.write('\n\nError: Python >= 3.7 required, not %d.%d!\n' % sys_ver[:2])
    # Python 2 only offers raw_input for reading a plain line of text
    wait_for_enter = raw_input if sys_ver[0] <= 2 else input  # pylint: disable=E0602
    wait_for_enter('\nPress enter to exit')
    sys.exit(1)
import os | |
import re | |
import ctypes | |
import argparse | |
import traceback | |
import collections | |
# Pause after any unexpected Python exception
# https://stackoverflow.com/a/781074 by Torsten Marek
def show_exception_and_exit(exc_type, exc_value, tb):
    """Report an uncaught exception, wait for Enter, then exit with status 1.

    The signature matches what sys.excepthook passes: the exception class,
    the exception instance and its traceback object.

    A KeyboardInterrupt only prints a blank line; any other exception prints
    a crash notice that names the tool, followed by the full traceback.
    """
    user_aborted = exc_type is KeyboardInterrupt

    if not user_aborted:
        print('\nError: %s crashed, please report the following:\n' % title)
        traceback.print_exception(exc_type, exc_value, tb)
    else:
        print('\n')

    # Keep the console window open until the user acknowledges the message
    input('\nPress enter to exit')

    sys.exit(1)
# Route every uncaught exception through the pause-able handler
sys.excepthook = show_exception_and_exit

# Set console/shell window title
user_os = sys.platform
if user_os == 'win32':
    ctypes.windll.kernel32.SetConsoleTitleW(title)
elif user_os == 'darwin' or 'bsd' in user_os or user_os.startswith('linux'):
    # Escape sequence ESC ] 2 ; <title> BEL, written straight to stdout
    sys.stdout.write('\x1b]2;' + title + '\x07')
# Set argparse Arguments
text_extractor = argparse.ArgumentParser()

# Each entry is (flags/names, keyword options) for one add_argument call.
# FileType('r') makes argparse open every positional path while parsing;
# the file-selection code further down only reads each handle's .name.
_argument_specs = (
    (('files',), {'type': argparse.FileType('r'), 'nargs': '*'}),
    (('-p', '--path'), {'help': 'parse files within given folder', 'type': str}),
    (('-s', '--size'), {'help': 'find text of given size or more (default is 4)', 'type': int}),
)
for _names, _options in _argument_specs:
    text_extractor.add_argument(*_names, **_options)

text_params = text_extractor.parse_args()
# Get all files within path
def get_files(path):
    """Return the paths of all files found under *path*, recursively.

    Each entry is the walked directory joined with the file name, listed in
    the order os.walk produces them. The result is an empty list when the
    walk yields nothing.
    """
    return [
        os.path.join(folder, file_name)
        for folder, _, file_names in os.walk(path)
        for file_name in file_names
    ]
# Decide which files to process, based on what was actually supplied
# rather than on the raw argument count. This way an invocation that
# passes only --size still reaches the interactive prompt instead of
# silently processing nothing.
if text_params.path:
    bin_files = get_files(text_params.path)  # CLI with --path
elif text_params.files:
    bin_files = []
    for executable in text_params.files:  # Drag & Drop
        bin_files.append(executable.name)
        # argparse.FileType already opened the file; only its name is needed,
        # so release the handle. A '-' argument maps to sys.stdin, which
        # must stay open for the later input() prompts.
        if executable is not sys.stdin:
            executable.close()
else:
    in_path = input('\nEnter the full folder path: ')
    bin_files = get_files(in_path)  # Direct Run
# Body of a regex character class (bytes): space, tab and the ASCII
# punctuation, digits and letters listed below, escaped where needed
ASCII_BYTE = rb' !\"#\$%&\'\(\)\*\+,-\./0123456789:;<=>\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]\^_`abcdefghijklmnopqrstuvwxyz\{\|\}\\\~\t'

# One extracted string: the decoded text and its byte offset in the input
String = collections.namedtuple('String', ['s', 'offset'])

# Minimum run length to report. Falls back to 4 when --size is missing, zero
# or negative: the value is interpolated into a '{N,}' regex repetition, and
# a negative N would not form a valid repetition at all.
char_count = text_params.size if text_params.size and text_params.size > 0 else 4
# Get ASCII Strings
def ascii_strings(buffer, char_count):
    """Yield a String for every ASCII run of at least *char_count* bytes.

    *buffer* is searched for consecutive bytes drawn from ASCII_BYTE. Each
    result carries the run decoded as ASCII text and the byte offset at
    which the run starts in *buffer*.
    """
    pattern = re.compile(rb'([%s]{%d,})' % (ASCII_BYTE, char_count))

    for found in pattern.finditer(buffer):
        text = found.group().decode('ascii')
        yield String(text, found.start())
# Get Unicode Strings
def unicode_strings(buffer, char_count):
    """Yield a String for every UTF-16 little-endian run of at least *char_count* characters.

    A character here is one byte from ASCII_BYTE followed by a 0x00 byte.
    Each result carries the decoded text and the byte offset at which the
    run starts in *buffer*.
    """
    uni_re = re.compile(rb'((?:[%s]\x00){%d,})' % (ASCII_BYTE, char_count))

    for match in uni_re.finditer(buffer):
        # The pattern is little-endian by construction (character byte first,
        # then 0x00), so decode as 'utf-16-le' explicitly. Plain 'utf-16'
        # without a BOM follows the platform's native byte order. Every match
        # is a sequence of complete, non-surrogate code units, so decoding
        # cannot fail and needs no exception handling.
        yield String(match.group().decode('utf-16-le'), match.start())
def main():
    """Extract ASCII and UTF-16 strings from every file in bin_files.

    For each input file, all strings of at least char_count characters are
    written as '0x<offset>: <text>' lines (ASCII results first, then
    Unicode) to '<input file name>.txt' in the input file's own folder.
    Paths that are not existing files are reported and skipped.
    """
    for input_file in bin_files:
        input_base = os.path.basename(input_file)

        print('\n*** %s' % input_base)

        if not os.path.isfile(input_file):
            print('\n Error: Cannot find input file %s!' % input_base)
            continue

        with open(input_file, 'rb') as in_file:
            buffer = in_file.read()

        # Collect the lines in a list and write them in one go, instead of
        # growing one large string with += for every extracted match
        output_lines = [
            '0x{:08X}: {:s}\n'.format(s.offset, s.s)
            for extract in (ascii_strings, unicode_strings)
            for s in extract(buffer, char_count)
        ]

        input_dir = os.path.dirname(os.path.abspath(input_file))
        output_path = os.path.join(input_dir, input_base + '.txt')

        with open(output_path, 'w', encoding='utf-8') as out:
            out.writelines(output_lines)

        print('\n Extracted %d ASCII/Unicode line(s) of length >= %d!' % (len(output_lines), char_count))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment