# ChefAndy/Droid spreadsheet diff report.py

## Droid spreadsheet diff report.py
import csv
from os import listdir
from os.path import isfile, join, basename
import re

"""
This simple script takes two directories of CSVs made by DROID and essentially diffs
them, assuming that the entries will have the same file names, same file size, same
immediate parent directory names, but different overall paths.

It's SUPER EXTRA commented on purpose, and designed to be more conceptually simple
than elegant on purpose.
"""


def main(server=None, harddrive=None):
    """
    Compare the server file listing against the hard drive listing and print
    the result.

    Every file recorded for the server is looked up among the hard drive
    files that share its MD5 checksum. A server file counts as "found" when
    an identical entry (same parent folder, file name, file type and file
    size) exists on the hard drive side. Every other server file is an
    "orphan" and is collected under its checksum.

    Arguments:
        server:    optional listing in the format returned by procdir (a
                   dictionary whose keys are checksums and whose values are
                   lists of file-property dictionaries). When it is None the
                   listing is built from the CSVs in './server'.
        harddrive: optional listing in the same format. When it is None the
                   listing is built from the CSVs in './harddrive'.

    Output (printed, nothing is returned):
        first line:  the orphans dictionary
        second line: the number of server files that were found
    """

    # only read the spreadsheets when the caller didn't hand us a listing
    if server is None:
        server = procdir('./server') #get the list of server hashes/files
    if harddrive is None:
        harddrive = procdir('./harddrive') #get the list of hard drive hashes/files

    found = 0 # this is to tally up the number of matched files
    orphans = {} # this is to store server files that aren't matched

    for servermd5, serverfiles in server.items(): # loop through the md5sum entries in server
        # A checksum that only exists on the server has no hard drive
        # candidates at all. Using .get() with an empty list means those
        # files become orphans instead of crashing with a KeyError.
        candidates = harddrive.get(servermd5, [])

        for serverfile in serverfiles: # loop through each file under this md5sum
            # 'in' compares the property dictionaries with ==, so this is
            # True only when every property matches exactly
            if serverfile in candidates:
                found += 1 # it was found, so update the found tally
            else:
                # record the *server* file that has no counterpart, making
                # sure the md5sum has a list in orphans first
                orphans.setdefault(servermd5, []).append(serverfile)

    # We're all done! Print out the results!
    print(orphans)
    print(found)


def procdir(mypath):
    """
    Read every DROID CSV export in a directory and group its entries by
    MD5 checksum.

    This function:
    iterates over the entries in the given directory (in sorted name order,
    so the result is the same on every run), and for each one it
        makes sure it's an actual file
        makes sure its name ends with '.csv' (upper or lower case)
        opens the file and iterates over each row
            skips rows that are too short to hold the columns we need
            (blank or truncated lines)
            skips rows whose TYPE is not File, Container or Folder (this is
            how the column header row gets dropped)
            works out the file name and the parent folder
            stores the properties in the filelist variable

    Arguments:
        mypath: the directory that holds the CSV files

    Returns:
        filelist[(md5checksum)][(numeric list index)][(property name)]

    filelist is a dictionary, and the keys to the dictionary are checksums.
    Since there may be multiple files with the same checksum, each checksum
    holds a list of files. Each list entry is a dictionary of file
    properties with the keys 'parentfolder', 'filename', 'filetype' and
    'filesize'. All of the values are strings taken from the CSV.

    for example, if checksum 5ac6684746b3998852cbc8793812fa92 has two files:
        filelist["5ac6684746b3998852cbc8793812fa92"][0] could be
        {'parentfolder': '000', 'filename': '205828-ZIP-158548-ZIP-0.winpe', 'filetype': 'File', 'filesize': '25088'}

        filelist["5ac6684746b3998852cbc8793812fa92"][1] could be
        {'parentfolder': '000', 'filename': '205828-ZIP-158548-ZIP-0_WZSHLEXT.DLL', 'filetype': 'File', 'filesize': '25088'}

    so print(filelist["5ac6684746b3998852cbc8793812fa92"][0]['parentfolder'])
    would print '000' to the terminal.

    Here is a key of the columns in the csv files this was written against
    row[2] is the URI
    row[3] is the FILE_PATH
    row[4] is the NAME
    row[5] is the METHOD
    row[6] is the STATUS
    row[7] is the SIZE
    row[8] is the TYPE
    row[9] is the EXT
    row[10] is the LAST_MODIFIED
    row[11] is the EXTENSION_MISMATCH
    row[12] is the MD5_HASH
    row[13] is the FORMAT_COUNT
    row[14] is the PUID
    row[15] is the MIME_TYPE
    row[16] is the FORMAT_NAME
    row[17] is the FORMAT_VERSION
    """
    filelist = {} #declare the filelist variable

    # sorted() because listdir returns names in arbitrary order
    for f in sorted(listdir(mypath)): #get the file names, assign each to 'f'
        csvpath = join(mypath, f)

        # skip anything that isn't a real file with a .csv extension
        if not isfile(csvpath) or not f.lower().endswith('.csv'):
            continue

        with open(csvpath, newline='') as csvfile: #open the file
            reader = csv.reader(csvfile, delimiter=',', quotechar='"') #turn the csv into rows
            for row in reader: #loop through the rows

                # the highest column we read is row[12], so a blank or
                # truncated line can't be processed and is skipped
                if len(row) <= 12:
                    continue

                #grab some values from the columns
                md5 = row[12]
                filetype = row[8]

                # this will split row[3], the file path, into a list of the
                # things in between the \ character... so individual
                # components of the file path
                pathcomponents = row[3].split('\\')

                # the second to the last component is the parent directory.
                # A path with no \ in it has only one component, and that
                # single component is used as-is.
                if len(pathcomponents) > 1:
                    parentfolder = pathcomponents[-2]
                else:
                    parentfolder = pathcomponents[0]

                # here, we see if it's a regular file or a folder, and get
                # the file/folder name using the appropriate method
                # if it's neither, it's probably the column header, and
                # should be skipped with 'continue'
                if filetype in ('File', 'Container'):
                    filename = basename(row[2])
                elif filetype == 'Folder':
                    filename = pathcomponents[-1]
                else:
                    continue

                # make the dictionary of file properties
                file_dict = {
                    'parentfolder': parentfolder,
                    'filename': filename,
                    'filetype': filetype,
                    'filesize': row[7],
                }

                # if the checksum isn't already in filelist, setdefault adds
                # it with an empty list; then the dictionary is appended
                filelist.setdefault(md5, []).append(file_dict)

    return filelist

# Only run the comparison when this file is executed as a script; importing
# it as a module just defines the functions without running anything.
if __name__ == "__main__":
    main()
	import csv
	from os import listdir
	from os.path import isfile, join, basename
	import re

	"""
	This simple script takes two directories of CSVs made by DROID and essentially diffs
	them, assuming that the entries will have the same file names, same file size, same
	immediate parent directory names, but different overall paths.

	It's SUPER EXTRA commented on purpose, and designed to be more conceptually simple
	than elegant on purpose.
	"""


	def main(server=None, harddrive=None):
		"""
		Compare the server file listing against the hard drive listing and
		print the result.

		Every file recorded for the server is looked up among the hard drive
		files that share its MD5 checksum. A server file counts as "found"
		when an identical entry (same parent folder, file name, file type and
		file size) exists on the hard drive side. Every other server file is
		an "orphan" and is collected under its checksum.

		Arguments:
			server:    optional listing in the format returned by procdir (a
			           dictionary whose keys are checksums and whose values
			           are lists of file-property dictionaries). When it is
			           None the listing is built from the CSVs in './server'.
			harddrive: optional listing in the same format. When it is None
			           the listing is built from the CSVs in './harddrive'.

		Output (printed, nothing is returned):
			first line:  the orphans dictionary
			second line: the number of server files that were found
		"""

		# only read the spreadsheets when the caller didn't hand us a listing
		if server is None:
			server = procdir('./server') #get the list of server hashes/files
		if harddrive is None:
			harddrive = procdir('./harddrive') #get the list of hard drive hashes/files

		found = 0 # this is to tally up the number of matched files
		orphans = {} # this is to store server files that aren't matched

		for servermd5, serverfiles in server.items(): # loop through the md5sum entries in server
			# A checksum that only exists on the server has no hard drive
			# candidates at all. Using .get() with an empty list means those
			# files become orphans instead of crashing with a KeyError.
			candidates = harddrive.get(servermd5, [])

			for serverfile in serverfiles: # loop through each file under this md5sum
				# 'in' compares the property dictionaries with ==, so this is
				# True only when every property matches exactly
				if serverfile in candidates:
					found += 1 # it was found, so update the found tally
				else:
					# record the *server* file that has no counterpart, making
					# sure the md5sum has a list in orphans first
					orphans.setdefault(servermd5, []).append(serverfile)

		# We're all done! Print out the results!
		print(orphans)
		print(found)


	def procdir(mypath):
		"""
		Read every DROID CSV export in a directory and group its entries by
		MD5 checksum.

		This function:
		iterates over the entries in the given directory (in sorted name
		order, so the result is the same on every run), and for each one it
			makes sure it's an actual file
			makes sure its name ends with '.csv' (upper or lower case)
			opens the file and iterates over each row
				skips rows that are too short to hold the columns we need
				(blank or truncated lines)
				skips rows whose TYPE is not File, Container or Folder (this
				is how the column header row gets dropped)
				works out the file name and the parent folder
				stores the properties in the filelist variable

		Arguments:
			mypath: the directory that holds the CSV files

		Returns:
			filelist[(md5checksum)][(numeric list index)][(property name)]

		filelist is a dictionary, and the keys to the dictionary are
		checksums. Since there may be multiple files with the same checksum,
		each checksum holds a list of files. Each list entry is a dictionary
		of file properties with the keys 'parentfolder', 'filename',
		'filetype' and 'filesize'. All of the values are strings taken from
		the CSV.

		for example, if checksum 5ac6684746b3998852cbc8793812fa92 has two files:
			filelist["5ac6684746b3998852cbc8793812fa92"][0] could be
			{'parentfolder': '000', 'filename': '205828-ZIP-158548-ZIP-0.winpe', 'filetype': 'File', 'filesize': '25088'}

			filelist["5ac6684746b3998852cbc8793812fa92"][1] could be
			{'parentfolder': '000', 'filename': '205828-ZIP-158548-ZIP-0_WZSHLEXT.DLL', 'filetype': 'File', 'filesize': '25088'}

		so print(filelist["5ac6684746b3998852cbc8793812fa92"][0]['parentfolder'])
		would print '000' to the terminal.

		Here is a key of the columns in the csv files this was written against
		row[2] is the URI
		row[3] is the FILE_PATH
		row[4] is the NAME
		row[5] is the METHOD
		row[6] is the STATUS
		row[7] is the SIZE
		row[8] is the TYPE
		row[9] is the EXT
		row[10] is the LAST_MODIFIED
		row[11] is the EXTENSION_MISMATCH
		row[12] is the MD5_HASH
		row[13] is the FORMAT_COUNT
		row[14] is the PUID
		row[15] is the MIME_TYPE
		row[16] is the FORMAT_NAME
		row[17] is the FORMAT_VERSION
		"""
		filelist = {} #declare the filelist variable

		# sorted() because listdir returns names in arbitrary order
		for f in sorted(listdir(mypath)): #get the file names, assign each to 'f'
			csvpath = join(mypath, f)

			# skip anything that isn't a real file with a .csv extension
			if not isfile(csvpath) or not f.lower().endswith('.csv'):
				continue

			with open(csvpath, newline='') as csvfile: #open the file
				reader = csv.reader(csvfile, delimiter=',', quotechar='"') #turn the csv into rows
				for row in reader: #loop through the rows

					# the highest column we read is row[12], so a blank or
					# truncated line can't be processed and is skipped
					if len(row) <= 12:
						continue

					#grab some values from the columns
					md5 = row[12]
					filetype = row[8]

					# this will split row[3], the file path, into a list of
					# the things in between the \ character... so individual
					# components of the file path
					pathcomponents = row[3].split('\\')

					# the second to the last component is the parent
					# directory. A path with no \ in it has only one
					# component, and that single component is used as-is.
					if len(pathcomponents) > 1:
						parentfolder = pathcomponents[-2]
					else:
						parentfolder = pathcomponents[0]

					# here, we see if it's a regular file or a folder, and
					# get the file/folder name using the appropriate method
					# if it's neither, it's probably the column header, and
					# should be skipped with 'continue'
					if filetype in ('File', 'Container'):
						filename = basename(row[2])
					elif filetype == 'Folder':
						filename = pathcomponents[-1]
					else:
						continue

					# make the dictionary of file properties
					file_dict = {
						'parentfolder': parentfolder,
						'filename': filename,
						'filetype': filetype,
						'filesize': row[7],
					}

					# if the checksum isn't already in filelist, setdefault
					# adds it with an empty list; then the dictionary is
					# appended
					filelist.setdefault(md5, []).append(file_dict)

		return filelist

	# Only run the comparison when this file is executed as a script; importing
	# it as a module just defines the functions without running anything.
	if __name__ == "__main__":
		main()