Skip to content

Instantly share code, notes, and snippets.

@abeger
Created November 11, 2012 23:16
Show Gist options
  • Save abeger/4056670 to your computer and use it in GitHub Desktop.
Save abeger/4056670 to your computer and use it in GitHub Desktop.
Runs through a list of files and finds any that have duplicate md5 hashes
import datetime
import hashlib

try:
    # Legacy Python 2 module (removed in Python 3); hashing uses hashlib.
    import md5
except ImportError:
    md5 = None
def output_line(line):
    """Print *line* to stdout prefixed with the current local timestamp.

    The timestamp is formatted as ``YYYY-MM-DD HH:MM:SS`` followed by
    ``': '`` and then the message text.
    """
    # Year-month-day order: unambiguous and sorts chronologically as text.
    timestamp = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
    print(timestamp + ': ' + line)
# Script body: hash every file named in mp3_list.txt (one path per line),
# group the paths by MD5 digest, and write each group of two or more
# identical files to duplicates.txt as a single ' OR '-joined line.
output_line("Starting program...")

# Files are hashed in fixed-size chunks so a large file is never held in
# memory in its entirety.
CHUNK_SIZE = 1024 * 1024

counter = 0
hash_dict = {}  # hex digest -> list of file names sharing that digest

with open('mp3_list.txt', 'r') as list_file:
    for line in list_file:
        mp3_name = line.strip()
        if not mp3_name:
            # Blank lines (e.g. a trailing newline) do not name a file.
            continue

        m = hashlib.md5()
        # Binary mode: the digest must be computed over the raw bytes, with
        # no newline translation or text decoding applied.
        with open(mp3_name, 'rb') as mp3_file:
            for chunk in iter(lambda: mp3_file.read(CHUNK_SIZE), b''):
                m.update(chunk)
        digest = m.hexdigest()

        hash_dict.setdefault(digest, []).append(mp3_name)

        counter += 1
        if counter % 100 == 0:
            output_line(str(counter) + " files inspected, " +
                        str(len(hash_dict)) + " hashes found...")

output_line("Writing duplicates.txt...")
with open('duplicates.txt', 'w') as dupe_file:
    for names in hash_dict.values():
        # Only digests shared by more than one file are duplicates.
        if len(names) > 1:
            dupe_file.write(' OR '.join(names) + "\n")

output_line("Finished.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment