# RyanMagnusson/md5walk.py

## md5walk.py
from typing import Union, Optional, List, Dict, Set
from enum import Enum
from os import walk
from os.path import realpath, join, isfile, isdir, basename, getsize
from hashlib import md5,sha256
from argparse import ArgumentParser
from json import loads as unpickle, dumps as pickle

import sys


def is_blank(text: object = None, cast: bool = True) -> bool:
    """Report whether ``text`` is missing or consists only of blank characters.

    The characters treated as blank are space, tab, line feed, carriage
    return and the non-breaking space (U+00A0).

    Args:
        text: Value to inspect. ``None`` always counts as blank.
        cast: When true (the default) a non-string value is converted with
            ``str()`` before it is inspected. When false, a non-string value
            is reported as not blank without being converted.

    Returns:
        ``True`` if ``text`` is ``None``, empty, or holds only blank
        characters; ``False`` otherwise.
    """
    if text is None:
        return True

    # ``chr`` is a builtin function, not a type, so it must not be handed to
    # isinstance(): doing so raises TypeError for every non-string argument.
    if not isinstance(text, str) and not cast:
        return False

    blank_chars = ' \t\n\r\xa0'
    return all(ch in blank_chars for ch in str(text))

def is_not_blank(text:Union[str,chr] = None) -> bool:
    """Return the logical negation of ``is_blank(text)``."""
    blank = is_blank(text)
    return False if blank else True

def trim_or_None_if_empty(text = None, strict: bool = False) -> Optional[str]:
    """Strip surrounding whitespace from ``text``; return ``None`` if nothing is left.

    Args:
        text: Value to trim. A non-string value is converted with ``str()``
            unless ``strict`` is set.
        strict: When true, reject any value that is not a ``str``.

    Returns:
        The stripped text, or ``None`` when ``text`` is ``None`` or is empty
        after stripping.

    Raises:
        ValueError: If ``strict`` is true and ``text`` is not a ``str``.
    """
    if text is None:
        return None

    # ``chr`` is a function, not a type; passing it to isinstance() raised
    # TypeError here instead of the ValueError this branch is meant to raise.
    if strict and not isinstance(text, str):
        try:
            rendered = str(text)
        except Exception:
            # A broken __str__ must not mask the real problem (wrong type).
            rendered = ''
        detail = ': ' + rendered if rendered else ''
        raise ValueError('A string or unicode character must be provided. A {} was given instead{}'.format(type(text), detail))

    trimmed = str(text).strip()
    return trimmed or None

class Algorithm(Enum):
    """Checksum algorithms this script can compute."""

    SHA256 = 'sha-256'
    MD5 = 'md5'

    @classmethod
    def values(cls) -> List['Algorithm']:
        """Return the members in lookup order: SHA256 first, then MD5."""
        return (cls.SHA256, cls.MD5)

    @classmethod
    def from_string(cls, text: str):
        """Find the member whose name or value matches ``text``.

        Matching ignores case and surrounding whitespace. A member matches on
        its name, its value, or its value with ``-`` replaced by ``_``.
        Returns ``None`` when ``text`` is blank or matches no member.
        """
        wanted = trim_or_None_if_empty(text)
        if wanted is not None:
            wanted = wanted.upper()
            for member in cls.values():
                spellings = (
                    member.name,
                    member.value.upper(),
                    member.value.replace('-', '_').upper(),
                )
                if wanted in spellings:
                    return member

        return None


class File(object):
    """A file path together with its checksums and size.

    ``path`` and ``size`` are validated properties; ``name`` and
    ``directory`` are read-only views derived from ``path``. ``checksums``
    is a plain dict attribute keyed by ``Algorithm`` member.
    """

    def __init__(self, path: str, checksums: Optional[Dict['Algorithm', str]] = None):
        """Create a File for ``path`` with optional pre-computed ``checksums``.

        A falsy ``checksums`` (``None`` or an empty dict) is replaced by a
        fresh dict, so instances never share checksum state.
        """
        self.path = path
        self.checksums = checksums or {}
        self.size = 0

    @property
    def path(self):
        """The trimmed path, or ``None`` when no usable path was given."""
        return self.__path

    @path.setter
    def path(self, value: str = None) -> None:
        # A property setter's return value is discarded, so nothing is returned.
        self.__path = trim_or_None_if_empty(value)

    @property
    def name(self):
        """Final component of ``path``, or ``None`` when no path is set."""
        if self.path:
            return basename(self.path)
        return None

    @property
    def directory(self):
        """Directory portion of ``path``, or ``None`` when no path is set."""
        if self.path:
            # Local import: dirname is not among the module-level imports.
            from os.path import dirname
            return dirname(self.path)
        return None

    @property
    def size(self):
        """Size in bytes; never negative."""
        return self.__size

    @size.setter
    def size(self, value: int = None):
        # Missing or negative sizes are normalised to zero.
        self.__size = 0 if value is None or value < 0 else value

    def __getstate__(self):
        """Return a dict holding the class name and the path."""
        data = {'py/object': self.__class__.__name__}
        data['path'] = self.path
        return data

    def __str__(self):
        """Return the path, or an empty string when no path is set."""
        # json.dumps() cannot serialise a File instance (it never consults
        # __getstate__), so the earlier attempt always fell back to the path.
        # The path is returned directly, and never as None, which str()
        # rejects with TypeError.
        return self.path or ''


class Column(Enum):
    """Output columns that can appear in a formatted file listing."""

    PATH = 'Path'
    DIRECTORY = 'Directory'
    NAME = 'Name'
    SHA256 = 'SHA-256'
    MD5 = 'MD5'
    SIZE = "Size"

    @classmethod
    def values(cls) -> List['Column']:
        """Return every member, in the order used for name lookups."""
        return (cls.NAME, cls.SHA256, cls.MD5, cls.PATH, cls.DIRECTORY, cls.SIZE)

    @classmethod
    def from_string(cls, text: str):
        """Find the member whose name or value matches ``text``.

        Matching ignores case and surrounding whitespace. A member matches on
        its name, its value, or its value with ``-`` replaced by ``_``.
        Returns ``None`` when ``text`` is blank or matches no member.
        """
        needle = trim_or_None_if_empty(text)
        if needle is None:
            return None

        needle = needle.upper()
        return next(
            (
                col for col in cls.values()
                if needle in (
                    col.name,
                    col.value.upper(),
                    col.value.replace('-', '_').upper(),
                )
            ),
            None,
        )


def calculate_checksum(file: str, algorithm: Algorithm, chunk_size: int = 65536) -> str:
    """Compute the hex digest of a file's contents.

    The file is read in binary mode in pieces of ``chunk_size`` bytes, so the
    whole file is never held in memory at once.

    Args:
        file: Path of the file to read.
        algorithm: ``Algorithm.MD5`` or ``Algorithm.SHA256``.
        chunk_size: Number of bytes requested per read; must be positive.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If ``algorithm`` is not supported or ``chunk_size`` is
            not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" at once and a negative size reads the whole file,
    # so a non-positive chunk size would give a wrong digest or defeat chunking.
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    # Choose the hasher before touching the file so that an unsupported
    # algorithm is reported without any I/O.
    if algorithm is Algorithm.MD5:
        digest = md5()
    elif algorithm is Algorithm.SHA256:
        digest = sha256()
    else:
        raise ValueError('Unsupported algorithm: ' + str(algorithm))

    with open(file, "rb") as stream:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF).
        for chunk in iter(lambda: stream.read(chunk_size), b""):
            digest.update(chunk)

    return digest.hexdigest()

# Command-line interface: one positional path plus flags choosing the digests.
arg_parser = ArgumentParser(description='Calculate md5 or sha256 of file(s)')
arg_parser.add_argument('file', help='file to hash, or directory to walk recursively')
arg_parser.add_argument('--sha256', action='store_true', default=False,
                        help='compute the SHA-256 digest')
# A store_true flag whose default is True can never be False, which made
# --md5 a no-op and forced an MD5 digest even when only --sha256 was asked
# for. The flag now defaults to False and MD5 is chosen below when no
# algorithm flag is given at all.
arg_parser.add_argument('--md5', action='store_true', default=False,
                        help='compute the MD5 digest (used when no algorithm flag is given)')


args = arg_parser.parse_args()
if not (args.md5 or args.sha256):
    args.md5 = True

def format_files(files: Optional[List[File]] = None, columns: Optional[List[Column]] = None, delimiter: str = '|') -> None:
    """Print one delimited line per file to standard output.

    Args:
        files: File objects to print. ``None`` is treated as an empty list.
        columns: Columns to print, in order. When ``None`` or empty, the
            columns are detected from the data: PATH (or NAME if a file has
            no path) plus a checksum column for each algorithm that at least
            one file carries.
        delimiter: Text placed between columns.

    Returns:
        ``None``; the lines are written with ``print``.
    """
    # Materialise once: the files are traversed twice below, which would
    # silently drop every row if a one-shot iterator were passed in.
    entries = list(files) if files is not None else []

    selected = columns
    if not selected:
        detected = set()
        for entry in entries:
            if is_not_blank(entry.path):
                detected.add(Column.PATH)
            elif is_not_blank(entry.name):
                detected.add(Column.NAME)

            checksums = entry.checksums or {}
            if checksums.get(Algorithm.SHA256) is not None:
                detected.add(Column.SHA256)
            if checksums.get(Algorithm.MD5) is not None:
                detected.add(Column.MD5)

        # Enum members hash by name and string hashes vary between
        # interpreter runs, so iterating the set directly would shuffle the
        # column order from run to run. Emit the columns in a fixed order.
        preferred = (Column.PATH, Column.NAME, Column.SHA256, Column.MD5)
        selected = [col for col in preferred if col in detected]

    for entry in entries:
        print(format_file(entry, selected, delimiter), file=sys.stdout)


def format_file(file: Optional[File] = None, columns: List[Column] = [Column.PATH, Column.SHA256, Column.MD5, Column.SIZE], delimiter: str = '|') -> str:
    """Render one file as a single delimited line.

    Args:
        file: The File to render. ``None`` yields an empty string.
        columns: Columns to emit, in order. ``None`` or an empty sequence
            yields an empty string. The default list is only read, never
            mutated.
        delimiter: Text placed between adjacent columns.

    Returns:
        The column values joined by ``delimiter``. A missing value is
        rendered as an empty cell so that columns stay aligned.

    Raises:
        ValueError: If ``columns`` contains something other than a supported
            ``Column`` member.
    """
    if file is None or not columns:
        return ''

    # Local import: dirname is not among the module-level imports.
    from os.path import dirname

    checksums = file.checksums or {}
    cells = []
    for col in columns:
        if col is Column.PATH:
            value = file.path
        elif col is Column.DIRECTORY:
            # Use the object's own attribute when it has one; otherwise
            # derive the directory from the path.
            value = getattr(file, 'directory', None)
            if value is None and file.path:
                value = dirname(file.path)
        elif col is Column.NAME:
            value = file.name
        elif col is Column.SHA256:
            value = checksums.get(Algorithm.SHA256)
        elif col is Column.MD5:
            value = checksums.get(Algorithm.MD5)
        elif col is Column.SIZE:
            # The earlier form `result + '0' if ... else str(size)` bound as
            # `(result + '0') if ... else str(size)` and so threw away every
            # column that had already been written.
            value = '0' if file.size is None else str(file.size)
        else:
            raise ValueError('Unsupported column for formatting')
        cells.append('' if value is None else value)

    # Joining always puts a delimiter between adjacent cells, including
    # after empty leading cells, so a value never slides into the wrong column.
    return delimiter.join(cells)

def _describe_file(path):
    """Build a File for ``path`` with the requested checksums and its size."""
    entry = File(realpath(path))
    if args.md5:
        entry.checksums[Algorithm.MD5] = calculate_checksum(entry.path, Algorithm.MD5)
    if args.sha256:
        entry.checksums[Algorithm.SHA256] = calculate_checksum(entry.path, Algorithm.SHA256)
    # Record the size on every path; the single-file branch used to skip it.
    entry.size = getsize(path)
    return entry


file_paths = []  # File objects, one per file that was hashed.
if isfile(args.file):
    file_paths.append(_describe_file(args.file))
else:
    # Walk the tree.
    for root, directories, files in walk(args.file):
        for filename in files:
            # Join the two strings in order to form the full filepath.
            file_paths.append(_describe_file(join(root, filename)))

format_files(file_paths)
	from typing import Union, Optional, List, Dict, Set
	from enum import Enum
	from os import walk
	from os.path import realpath, join, isfile, isdir, basename, getsize
	from hashlib import md5,sha256
	from argparse import ArgumentParser
	from json import loads as unpickle, dumps as pickle

	import sys


	def is_blank(text:Union[str,chr] = None, cast:bool = True) -> bool:
	if text is None:
	return True

	if not isinstance(text, (str,chr)) and not cast:
	return False

	_text = str(text)
	for c in _text:
	if c != ' ' and c != '\t' and c != '\n' and c != '\r' and c != chr(160):
	return False

	return True

	def is_not_blank(text:Union[str,chr] = None) -> bool:
	return not is_blank(text)

	def trim_or_None_if_empty(text = None, strict:bool = False) -> Optional[str]:
	if text is None:
	return None

	if strict and not isinstance(text, (str,chr)):
	as_text = None
	try:
	as_text = str(text)
	if as_text:
	as_text = ": " + as_text
	except:
	pass
	raise ValueError('A string or unicode character must be provided. A {} was given instead{}'.format(type(text), as_text))

	trimmed = str(text).strip()
	if len(trimmed) < 1:
	return None

	return trimmed

	class Algorithm(Enum):
	SHA256 = 'sha-256'
	MD5 = 'md5'

	@classmethod
	def values(cls) -> List['Algorithm']:
	return (Algorithm.SHA256, Algorithm.MD5)

	@classmethod
	def from_string(cls, text: str):
	trimmed = trim_or_None_if_empty(text)
	if trimmed is None:
	return None

	trimmed = trimmed.upper()
	for t in cls.values():
	if (t.name == trimmed
	or t.value.upper() == trimmed
	or t.value.replace('-', '_').upper() == trimmed):
	return t

	return None


	class File(object):
	def __init__(self, path:str, checksums:Dict[Algorithm,str] = {}):
	self.path = path
	self.checksums = checksums or {}
	self.size = 0

	@property
	def path(self):
	return self.__path

	@path.setter
	def path(self, value:str = None) -> 'File':
	self.__path = trim_or_None_if_empty(value)
	return self

	@property
	def name(self):
	if self.path:
	return basename(self.path)

	@property
	def size(self):
	return self.__size

	@size.setter
	def size(self, bytes:int = None):
	self.__size = 0 if bytes is None or bytes < 0 else bytes

	def __getstate__(self):
	data = {'py/object': self.__class__.__name__ }
	data['path'] = self.path
	return data

	def __str__(self):
	try:
	return pickle(self)
	except:
	return self.path


	class Column(Enum):
	PATH = 'Path'
	DIRECTORY = 'Directory'
	NAME = 'Name'
	SHA256 = 'SHA-256'
	MD5 = 'MD5'
	SIZE = "Size"

	@classmethod
	def values(cls) -> List['Column']:
	return (Column.NAME, Column.SHA256, Column.MD5, Column.PATH, Column.DIRECTORY, Column.SIZE)

	@classmethod
	def from_string(cls, text: str):
	trimmed = trim_or_None_if_empty(text)
	if trimmed is None:
	return None

	trimmed = trimmed.upper()
	for t in cls.values():
	if (t.name == trimmed
	or t.value.upper() == trimmed
	or t.value.replace('-', '_').upper() == trimmed):
	return t

	return None


	def calculate_checksum(file:str, algorithm:Algorithm):
	hash = None
	if algorithm is Algorithm.MD5:
	hash = md5()
	elif algorithm is Algorithm.SHA256:
	hash = sha256()
	else:
	raise ValueError('Unsuported algorithm: ' + str(algorithm))

	with open(file, "rb") as f:
	for chunk in iter(lambda: f.read(4096), b""):
	hash.update(chunk)
	return hash.hexdigest()

	arg_parser = ArgumentParser(description='Calculate md5 or sha256 of file(s)')
	arg_parser.add_argument('file')
	arg_parser.add_argument('--sha256', action='store_true', default=False)
	arg_parser.add_argument('--md5', action='store_true', default=True)


	args = arg_parser.parse_args()

	def format_files(files:List[File] = [], columns:List[Column] = None, delimiter:chr = '\|') -> str:
	_columns = columns
	if columns is None or len(columns) < 1:
	colSet = set([])
	for f in files:
	if is_not_blank(f.path):
	colSet.add(Column.PATH)
	elif is_not_blank(f.name):
	colSet.add(Column.NAME)

	if f.checksums is not None and len(f.checksums) > 0:
	if f.checksums.get(Algorithm.SHA256,None) is not None:
	colSet.add(Column.SHA256)

	if f.checksums.get(Algorithm.MD5,None) is not None:
	colSet.add(Column.MD5)

	_columns = list(colSet)

	for f in files:
	print(format_file(f, _columns, delimiter), file=sys.stdout)


	def format_file(file:File = None, columns:List[Column] = [Column.PATH, Column.SHA256, Column.MD5, Column.SIZE], delimiter:chr = '\|') -> str:
	if file is None:
	return ''

	if columns is None or len(columns) < 1:
	return ''

	result = ''
	for col in columns:
	if result:
	result = result + delimiter
	if col is Column.PATH:
	result = result + ('' if file.path is None else file.path)
	elif col is Column.DIRECTORY:
	result = result + ('' if file.directory is None else file.directory)
	elif col is Column.NAME:
	result = result + ('' if file.name is None else file.name)
	elif col is Column.SHA256:
	sha256_checksum = None if file.checksums is None else file.checksums.get(Algorithm.SHA256, None)
	result = result + ('' if sha256_checksum is None else sha256_checksum)
	elif col is Column.MD5:
	md5_checksum = None if file.checksums is None else file.checksums.get(Algorithm.MD5, None)
	result = result + ('' if md5_checksum is None else md5_checksum)
	elif col is Column.SIZE:
	result = result + '0' if file.size is None else str(file.size)
	else:
	raise ValueError('Unsupported column for formatting')
	return result

	file_paths = [] # List which will store all of the full filepaths.
	if isfile(args.file):
	f = File(realpath(args.file))
	if args.md5:
	f.checksums[Algorithm.MD5] = calculate_checksum(f.path, Algorithm.MD5)
	if args.sha256:
	f.checksums[Algorithm.SHA256] = calculate_checksum(f.path, Algorithm.SHA256)

	file_paths.append(f)
	else:
	# Walk the tree.
	for root, directories, files in walk(args.file):
	for filename in files:
	# Join the two strings in order to form the full filepath.
	filepath = join(root, filename)
	f = File(realpath(filepath))
	if args.md5:
	f.checksums[Algorithm.MD5] = calculate_checksum(f.path, Algorithm.MD5)
	if args.sha256:
	f.checksums[Algorithm.SHA256] = calculate_checksum(f.path, Algorithm.SHA256)
	f.size = getsize(filepath)
	file_paths.append(f)

	format_files(file_paths)