Skip to content

Instantly share code, notes, and snippets.

@walkerh
Created March 15, 2016 22:52
Show Gist options
  • Save walkerh/606fb49e2ad8059d44f5 to your computer and use it in GitHub Desktop.
Save walkerh/606fb49e2ad8059d44f5 to your computer and use it in GitHub Desktop.
Recursively scan the specified directory, emitting manifest of files.
#!/usr/bin/env python3
"""Recursively scan the specified directory, emitting manifest of files.
Writes to file or stdout.
Requires Python 2.7+
"""
import argparse
from hashlib import md5 as hash
import io
import logging
import os
import stat
import sys
# Byte-size units.
K = 1 << 10
M = K ** 2
# hash_file() reads at most this many bytes per read() call.
BLOCK_SIZE = K * 16
# hash_file() hashes files up to this size in full and samples larger ones.
SIZE_THRESHOLD = M
def main():
    """Command-line entry point: parse arguments, set up logging, scan.

    The manifest file opened during argument parsing is closed, and the
    logging system shut down, even when the scan raises.
    """
    args = parse_args()
    config_logging(args)
    try:
        run(args)
    finally:
        # argparse.FileType never closes the file it opens, so do it here.
        # sys.stdout is not ours to close.
        if args.output_file is not sys.stdout:
            args.output_file.close()
        logging.shutdown()
def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Optional list of argument strings, without the program name.
            When None, argparse falls back to sys.argv[1:].

    Returns:
        An argparse.Namespace with ``root_dir_path`` (str), ``output_file``
        (a file opened for text writing, sys.stdout when omitted) and
        ``verbose`` (bool).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('root_dir_path')
    # FileType opens the file while parsing; closing it is the caller's job.
    parser.add_argument('output_file', type=argparse.FileType('w'),
                        nargs='?', default=sys.stdout)
    parser.add_argument('-v', '--verbose', action='store_true')
    return parser.parse_args(argv)
def config_logging(args):
    """Configure root logging and bind the module-level ``logger``.

    Args:
        args: Parsed arguments; only ``args.verbose`` is read. DEBUG level is
            requested when it is true, INFO otherwise.

    Returns:
        The 'scanner' logger, which is also stored in the global ``logger``.
    """
    global logger
    level = logging.DEBUG if args.verbose else logging.INFO
    # basicConfig does nothing if the root logger already has handlers.
    logging.basicConfig(level=level)
    logger = logging.getLogger('scanner')
    return logger
def run(args):
    """Scan ``args.root_dir_path`` and write the manifest to ``args.output_file``.

    Logs one INFO line before the scan and one after it completes.

    Args:
        args: Object providing ``root_dir_path`` and ``output_file``.
    """
    # Look the logger up by name instead of relying on the global that
    # config_logging() binds, so run() also works when called on its own.
    log = logging.getLogger('scanner')
    log.info('starting scan %r', args)
    scan_dir(args.root_dir_path, args.output_file)
    log.info('finished scan %r', args)
def scan_dir(root_dir_path, output_file):
    """Walk ``root_dir_path`` and write one manifest line per regular file.

    Each line holds five tab-separated fields: size in bytes, the value
    returned by hash_file(), hard-link count, inode number, and the file's
    path under the symlink-resolved root. Entries that stat_file() reports as
    not regular are skipped.

    Directories that cannot be listed and files that cannot be stat'ed or
    read are logged as warnings and left out of the manifest rather than
    aborting the whole scan.

    Args:
        root_dir_path: Directory to scan recursively.
        output_file: Writable text file object receiving the manifest lines.
    """
    log = logging.getLogger('scanner')

    def report_walk_error(error):
        # Without an onerror callback os.walk ignores listing failures silently.
        log.warning('cannot list directory: %s', error)

    base = os.path.realpath(root_dir_path)
    for dir_path, _dir_names, file_names in os.walk(
            base, onerror=report_walk_error):
        for file_name in file_names:
            file_path = os.path.join(dir_path, file_name)
            try:
                is_regular, inode, num_links, file_size = stat_file(file_path)
                if not is_regular:
                    continue
                file_hash = hash_file(file_path, file_size)
            except (IOError, OSError) as error:
                # The file vanished after being listed, or is unreadable.
                log.warning('skipping %s: %s', file_path, error)
                continue
            output_file.write(
                '{}\t{}\t{}\t{}\t{}\n'.format(file_size, file_hash,
                                              num_links, inode, file_path)
            )
def stat_file(file_path):
    """Return ``(is_regular, inode, num_links, file_size)`` for ``file_path``.

    Uses os.lstat, so a symbolic link is described itself rather than the
    file it points to.

    Args:
        file_path: Path of the filesystem entry to inspect.

    Returns:
        A 4-tuple: whether the entry is a regular file, its inode number,
        its hard-link count, and its size in bytes.

    Raises:
        OSError: If the path cannot be stat'ed.
    """
    info = os.lstat(file_path)
    return (
        stat.S_ISREG(info.st_mode),
        info.st_ino,
        info.st_nlink,
        info.st_size,
    )
def hash_file(file_path, file_size):
    """Return a short fingerprint of the form ``'whv01-'`` + 10 hex digits.

    Files of at most SIZE_THRESHOLD bytes are hashed in full, BLOCK_SIZE
    bytes at a time. Larger files are only sampled: the first, middle and
    last BLOCK_SIZE bytes are fed to the hash, so equal fingerprints do not
    prove equal content.

    Args:
        file_path: Path of the file to read.
        file_size: Size in bytes as known to the caller; selects full versus
            sampled hashing and positions the samples.

    Returns:
        ``'whv01-'`` followed by the first 10 hex digits of the digest.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    h = hash()  # the module's hashlib alias, not the builtin hash()
    with open(file_path, 'rb') as input_file:
        if file_size <= SIZE_THRESHOLD:
            # read() returns b'' at end of file, which ends the iteration.
            for chunk in iter(lambda: input_file.read(BLOCK_SIZE), b''):
                h.update(chunk)
        else:
            for offset in (0, file_size // 2, file_size - BLOCK_SIZE):
                input_file.seek(offset)
                h.update(input_file.read(BLOCK_SIZE))
    return 'whv01-' + h.hexdigest()[:10]
# Run the scanner only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment