Old script that could do with some updates (like a safer tmp file), but sufficiently matches files and tells you about duplicates.
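The "safer tmp file" update mentioned above could look something like the following sketch. This is an assumption about how one might wire it in, not part of the original script: the standard tempfile module replaces the hard-coded /tmp path, and because mkstemp() creates the file itself with a unique, race-free name, the os.path.exists() check in create_database() would become redundant.

import os
import tempfile

#Hypothetical replacement for the fixed __DBFILE__ path below.
#mkstemp() returns an open descriptor plus a path no other process
#can have pre-created, avoiding symlink/clobber attacks on /tmp.
fd, __DBFILE__ = tempfile.mkstemp(prefix="FileDupeMatch-", suffix=".db")
os.close(fd)  #sqlite3 will reopen the file by path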
#!/usr/bin/env python
"""
Finds duplicate files based on their MD5 hashes.
"""
import os
import os.path
import sys
import sqlite3
import hashlib
from optparse import OptionParser
from multiprocessing import Pool

#Default values
__PROCESSES__ = 20 #Seems about right to keep everything flowing
__DBFILE__ = "/tmp/FileDupeMatch.db"
#Parse options
parser = OptionParser(description=__doc__.strip())
parser.add_option("-d",
                  "--directory",
                  action="store",
                  type="string",
                  dest="files_dir",
                  default=False,
                  help="directory with files to search",
                  metavar="DIR")
parser.add_option("-o",
                  "--output",
                  action="store",
                  type="string",
                  dest="output_file",
                  default="",
                  help="where to save the duplicate list to",
                  metavar="FILE")
parser.add_option("-t",
                  "--threads",
                  action="store",
                  type="int",
                  dest="processes",
                  default=__PROCESSES__,
                  help="number of worker processes to use")
options, args = parser.parse_args()
def main():
    """
    Function loader.
    """
    #Find the duplicate files
    if (options.files_dir and
            os.path.isdir(options.files_dir) and
            create_database()):
        multi_thread_dup_searcher()
        print_dups("files", "Duplicate files")
        os.remove(__DBFILE__)
    else:
        parser.print_help()
        print "Error: no directory given or %s already exists" % (__DBFILE__)
def create_database():
    """
    Create the database.
    """
    if os.path.exists(__DBFILE__):
        return False
    else:
        conn = sqlite3.connect(__DBFILE__)
        conn.text_factory = str
        c = conn.cursor()
        c.execute("create table files (id text, filename text)")
        conn.commit()
        conn.close()
        return True
def multi_thread_dup_searcher():
    """
    Uses multiprocessing to hash several files at once, reducing the time it
    takes to find the duplicates.
    """
    file_paths = find_files()
    #Start the worker pool
    pool = Pool(processes=options.processes)
    r = pool.map_async(wrap_get_unique_ids, file_paths)
    pool.close()
    #Wait for everything to finish
    pool.join()
    #Show any files that could not be processed
    errors = r.get()
    #For ease of reading
    errors.sort()
    for error in errors:
        if error:
            print error
def find_files():
    """
    Creates a list of all files under the search directory.
    """
    file_paths = []
    for root, dirs, files in os.walk(options.files_dir):
        for f in files:
            file_paths.append(os.path.join(root, f))
    return file_paths
def wrap_get_unique_ids(file_path):
    """
    Attempt to catch any errors and move on with the other files.
    """
    try:
        return get_unique_ids(file_path)
    except Exception, error:
        return "%s:: %s" % (file_path, error)
def get_unique_ids(file_path):
    """
    Hash one file and record the result.
    If a hash maps to more than one filename, there is a duplicate.
    """
    #Connect to the database; each worker process needs its own connection
    conn = sqlite3.connect(__DBFILE__)
    conn.text_factory = str
    c = conn.cursor()
    #Hash in 8 KiB chunks so large files are never loaded into memory whole
    with open(file_path, 'rb') as f:
        m = hashlib.md5()
        while True:
            data = f.read(8192)
            if not data:
                break
            m.update(data)
    c.execute("INSERT INTO files VALUES (?, ?)", (m.hexdigest(), file_path))
    #Clean up
    conn.commit()
    conn.close()
    #No errors
    return False
def print_dups(dup_db, title):
    """
    Print a list of all duplicates. Will append to the file if it already
    exists. STDOUT is the fallback if needed.
    """
    if options.output_file:
        try:
            output = open(options.output_file, 'a')
        except IOError:
            print "Couldn't open/create your output file, printing to STDOUT"
            output = sys.stdout
    else:
        print "No output file specified, printing to STDOUT"
        output = sys.stdout
    #Connect to the database
    conn = sqlite3.connect(__DBFILE__)
    conn.text_factory = str
    c1 = conn.cursor()
    #Use SQL to find the ids of the duplicates
    c1.execute("SELECT id FROM %s GROUP BY id HAVING (COUNT(id)>1)" % (dup_db))
    #Pretty output
    #output.write("\n")
    #output.write("*******************\n")
    #output.write(title)
    #output.write("\n")
    #output.write("*******************\n")
    for row1 in c1:
        c2 = conn.cursor()
        c2.execute("SELECT filename FROM %s WHERE id=? ORDER BY filename" % (dup_db),
                   (row1[0],))
        for row2 in c2:
            output.write(row2[0])
            output.write("\n")
        c2.close()
        #Blank line between each group of duplicates
        output.write("\n")
    output.write("\n")
    #Clean up
    conn.close()
if __name__ == '__main__':
    main()
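One more update worth noting: sqlite3 allows only one writer at a time, so with twenty worker processes all inserting into the same file, an INSERT can occasionally fail with "database is locked" (such failures are caught by wrap_get_unique_ids and reported as errors rather than crashing the run). A minimal sketch of one alternative, reusing the script's own imports, globals, and find_files(), with hypothetical names hash_one and single_writer_searcher: hash in the workers, insert from the parent.

def hash_one(file_path):
    #Hash a single file in a worker process; no database access here
    m = hashlib.md5()
    with open(file_path, 'rb') as f:
        while True:
            data = f.read(8192)
            if not data:
                break
            m.update(data)
    return m.hexdigest(), file_path

def single_writer_searcher():
    #The parent process owns the only database connection
    pool = Pool(processes=options.processes)
    rows = pool.map(hash_one, find_files())
    pool.close()
    pool.join()
    conn = sqlite3.connect(__DBFILE__)
    conn.executemany("INSERT INTO files VALUES (?, ?)", rows)
    conn.commit()
    conn.close()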