Skip to content

Instantly share code, notes, and snippets.

@KhasMek
Last active October 30, 2019 16:31
Show Gist options
  • Save KhasMek/0031f015f155801e538efd8e57abb49a to your computer and use it in GitHub Desktop.
Save KhasMek/0031f015f155801e538efd8e57abb49a to your computer and use it in GitHub Desktop.
Create a report on the specified directory as a workbook with two worksheets: one listing unique file(s)/hashes and one listing duplicate file(s)/hashes.
#!/usr/bin/python
import hashlib
import json
import os
import sys
import xlsxwriter
# Module-level accumulator filled by main(): maps each hash string returned
# by return_hash() to the list of file paths that produced it.
hashdict = {}
def return_hash(fname):
    """Return the SHA-1 hex digest of the entire contents of *fname*.

    The file is read in 64 KiB chunks so that large files are never loaded
    into memory in one piece.

    Args:
        fname: Path of the file to hash.

    Returns:
        The SHA-1 digest of the file as a hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    sha1 = hashlib.sha1()
    with open(fname, 'rb') as f:
        # Read until read() returns b'' (EOF). Hashing only the first chunk
        # would make files that differ after 64 KiB look like duplicates.
        for chunk in iter(lambda: f.read(65536), b''):
            sha1.update(chunk)
    return sha1.hexdigest()
def write_workbook(hashdict, outfile='results.xlsx'):
    """Write the hash report to an .xlsx workbook with two worksheets.

    Hashes that map to exactly one path go on the 'Unique Hashes' sheet;
    hashes that map to several paths go on the 'Duplicate Hashes' sheet with
    the paths joined by newlines in a single cell.

    Args:
        hashdict: Mapping of hash string -> list of file paths. Entries with
            an empty list are skipped.
        outfile: Path of the workbook to create.
    """
    workbook = xlsxwriter.Workbook(outfile)
    common_formatting = {'text_wrap': 1, 'font_name': 'Arial', 'font_size': 10, 'align': 'center', 'valign': 'vcenter', 'border': 6, 'border_color': 'gray'}
    header = workbook.add_format({**common_formatting, 'bold': 1, 'bg_color': '#366092', 'font_color': 'white'})
    body = workbook.add_format(common_formatting)
    unique_ws = workbook.add_worksheet('Unique Hashes')
    dup_ws = workbook.add_worksheet('Duplicate Hashes')

    # Next free data row on each sheet; row 0 is reserved for the header.
    unique_row = 1
    dup_row = 1
    for digest, paths in hashdict.items():
        if not paths:
            # Nothing to report for this hash; avoids writing to a stale
            # (or not yet chosen) worksheet.
            continue
        # Compare by value: `is` on ints/strings tests object identity and
        # only works by accident of interpreter caching.
        if len(paths) == 1:
            worksheet, row = unique_ws, unique_row
            unique_row += 1
        else:
            worksheet, row = dup_ws, dup_row
            dup_row += 1
        worksheet.set_row(row)
        # Apply the body format: its text_wrap setting is what lets the
        # newline-joined path list display on separate lines in the cell.
        worksheet.write(row, 0, digest, body)
        worksheet.write(row, 1, '\n'.join(paths), body)

    # Pair each sheet with its own row counter instead of looking the sheet
    # up by name, so the last row is always defined.
    for worksheet, next_free_row in ((unique_ws, unique_row), (dup_ws, dup_row)):
        worksheet.set_column(0, 0, 45)
        worksheet.set_column(1, 1, 100)
        worksheet.write_row(0, 0, ['Hash', "File(s)"], header)
        # Filter range: header row through the last written row, over the
        # two populated columns (0 and 1).
        worksheet.autofilter(0, 0, next_free_row - 1, 1)
    workbook.close()
    print("Created: {}".format(outfile))
def main(_dir, type_filter=None):
    """Hash the files under *_dir* and write the unique/duplicate report.

    Walks *_dir* recursively, records every hashed file in the module-level
    ``hashdict`` (hash -> list of paths), then passes that mapping to
    ``write_workbook``.

    Args:
        _dir: Root directory to scan.
        type_filter: Optional filename suffix (e.g. ``'.txt'``); when given,
            only files whose names end with it are hashed. ``None`` or an
            empty string hashes every file.
    """
    for root, _subdirs, files in os.walk(_dir):
        for filename in files:
            # A missing or empty filter means "hash every file".
            if type_filter and not filename.endswith(type_filter):
                continue
            fname = os.path.join(root, filename)
            try:
                fhash = return_hash(fname)
            except OSError as err:
                # One unreadable entry (e.g. a dangling symlink or a
                # permission error) must not abort the whole scan.
                print('Skipping {}: {}'.format(fname, err), file=sys.stderr)
                continue
            hashdict.setdefault(fhash, []).append(fname)
    write_workbook(hashdict)
if __name__ == "__main__":
    # Usage: script [directory] [filename-suffix-filter]
    # The directory defaults to the current one; the filter is optional.
    _dir = sys.argv[1] if len(sys.argv) > 1 else '.'
    type_filter = sys.argv[2] if len(sys.argv) > 2 else None
    if type_filter is not None:
        print('File type filter: {}'.format(type_filter))
    # Call main() outside any try block so an IndexError raised during the
    # scan is not mistaken for a missing command-line argument.
    main(_dir, type_filter)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment