Skip to content

Instantly share code, notes, and snippets.

@cyanidium
Created October 19, 2014 17:46
Show Gist options
  • Save cyanidium/67aa986f1cd535d2f996 to your computer and use it in GitHub Desktop.
Old script that could do with some updates (like a safer tmp file), but sufficiently matches files and tells you about duplicates.
#!/usr/bin/env python
"""
Finds duplicate files based on MD5 hashsum
"""
import os
import os.path
import sys
import sqlite3
import hashlib
from optparse import OptionParser
from multiprocessing import Pool
# Tunable defaults.
__PROCESSES__ = 20  # seems about right to keep everything flowing
__DBFILE__ = "/tmp/FileDupeMatch.db"  # scratch database for hash -> path rows

# Command-line interface; the module docstring doubles as the description.
parser = OptionParser(description=__doc__.strip())
parser.add_option(
    "-d", "--directory",
    action="store",
    type="string",
    dest="files_dir",
    metavar="DIR",
    default=False,
    help="directory with files to search")
parser.add_option(
    "-o", "--output",
    action="store",
    type="string",
    dest="output_file",
    metavar="FILE",
    default="",
    help="where to save the duplicate list to")
parser.add_option(
    "-t", "--threads",
    action="store",
    type="int",
    dest="processes",
    default=__PROCESSES__,
    help="number of threads to use")
options, args = parser.parse_args()
def main():
    """
    Entry point: validate the target directory, create the scratch
    database, hash everything in parallel, report duplicates, then
    remove the database.

    Falls through to the usage message when no directory was given,
    the directory does not exist, or the database file already exists.
    """
    if (options.files_dir and
            os.path.isdir(options.files_dir) and
            create_database()):
        multi_thread_dup_searcher()
        print_dups("files", "Duplicate files")
        # The database is a throwaway scratch file; remove it so the
        # next run starts clean.
        os.remove(__DBFILE__)
    else:
        parser.print_help()
        # print() with a single argument behaves identically on
        # Python 2 and 3; the old "print x" statement is 2-only.
        print("Error: no directory given or %s already exists" % (__DBFILE__))
def create_database():
    """
    Initialise the scratch SQLite database with a ``files`` table.

    Returns True when a fresh database was created, or False when the
    database file already exists (likely left over from another run).
    """
    # Guard: refuse to clobber an existing database file.
    if os.path.exists(__DBFILE__):
        return False
    conn = sqlite3.connect(__DBFILE__)
    conn.text_factory = str
    cursor = conn.cursor()
    cursor.execute("create table files (id text, filename text)")
    conn.commit()
    conn.close()
    return True
def multi_thread_dup_searcher():
    """
    Hash every file in a worker pool, reducing the time it takes to
    find the duplicates, then print any per-file errors.

    Workers return False on success or an error string on failure
    (see wrap_get_unique_ids); successes are dropped and failures are
    printed in sorted order for ease of reading.
    """
    file_paths = find_files()
    pool = Pool(processes=options.processes)
    result = pool.map_async(wrap_get_unique_ids, file_paths)
    pool.close()
    # Wait for everything to finish.
    pool.join()
    # Filter before sorting: the original sorted the raw result list,
    # but sorting a mix of False and strings raises TypeError on
    # Python 3. Filtering first prints the exact same lines.
    for error in sorted(e for e in result.get() if e):
        print(error)
def find_files():
    """
    Walk options.files_dir and collect every file found.

    Returns a tuple of full file paths (tuple kept for compatibility
    with the original return type).
    """
    # Tuple += inside the loop copies the whole tuple each time
    # (quadratic); build everything once from a generator instead.
    return tuple(os.path.join(root, f)
                 for root, dirs, files in os.walk(options.files_dir)
                 for f in files)
def wrap_get_unique_ids(file_path):
    """
    Run get_unique_ids on one file, converting any exception into an
    error string so a single bad file cannot kill the worker pool.

    Returns whatever get_unique_ids returns (False on success), or a
    "path:: error" string on failure.
    """
    try:
        return get_unique_ids(file_path)
    # Deliberately broad: report the failure and keep the pool going.
    # "except Exception, error" was Python 2-only syntax; "as" works
    # on Python 2.6+ and 3.
    except Exception as error:
        return "%s:: %s" % (file_path, error)
def get_unique_ids(file_path):
    """
    Hash one file with MD5 and record (digest, path) in the database.

    If a digest ends up with more than one row, those rows are
    duplicates (reported later by print_dups).

    Returns False to signal "no error" to the pool wrapper.
    """
    conn = sqlite3.connect(__DBFILE__)
    conn.text_factory = str
    # try/finally so the connection is always released: the original
    # leaked the handle whenever hashing or the insert raised, since
    # wrap_get_unique_ids swallows the exception.
    try:
        c = conn.cursor()
        m = hashlib.md5()
        with open(file_path, 'rb') as f:
            # Read in 8 KiB chunks so huge files never sit in memory.
            while True:
                data = f.read(8192)
                if not data:
                    break
                m.update(data)
        c.execute("INSERT INTO files VALUES (?, ?)", (m.hexdigest(), file_path,))
        conn.commit()
    finally:
        conn.close()
    # No errors.
    return False
def print_dups(dup_db, title):
    """
    Print every group of duplicate files, one group per paragraph.

    Appends to options.output_file when it is set and can be opened;
    otherwise falls back to STDOUT. `title` is currently unused (a
    pretty header was removed at some point) but kept for callers.
    """
    output = sys.stdout
    close_output = False  # only close handles we opened ourselves
    if options.output_file:
        try:
            output = open(options.output_file, 'a')
            close_output = True
        except IOError:
            print("Couldn't open/create your output file, printing to STDOUT")
    else:
        print("No output file specified, printing to STDOUT")
    # Connect to the database.
    conn = sqlite3.connect(__DBFILE__)
    conn.text_factory = str
    try:
        c1 = conn.cursor()
        # A table name cannot be bound as a SQL parameter; dup_db is an
        # internal constant ("files"), not untrusted input.
        c1.execute("SELECT id FROM %s GROUP BY id HAVING (COUNT(id)>1)" % (dup_db))
        for row1 in c1:
            c2 = conn.cursor()
            c2.execute("SELECT filename FROM %s WHERE id=? ORDER BY filename" % (dup_db), (row1[0],))
            for row2 in c2:
                output.write(row2[0])
                output.write("\n")
            c2.close()
            # Blank lines separate one duplicate group from the next.
            output.write("\n")
            output.write("\n")
    finally:
        # Always release the connection, and close the output file if
        # (and only if) we opened one — never close sys.stdout.
        conn.close()
        if close_output:
            output.close()
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment