Created
November 11, 2012 23:16
-
-
Save abeger/4056670 to your computer and use it in GitHub Desktop.
Runs through a list of files and finds any that have duplicate MD5 hashes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import hashlib
# NOTE: the standalone `md5` module exists only on Python 2 (removed in
# Python 3); `hashlib` provides the same digest on both.
import md5
def output_line(line):
    """Print *line* to stdout prefixed with the current local timestamp.

    The prefix has the form ``YYYY-MM-DD HH:MM:SS`` followed by ``': '``.

    Args:
        line: The message text to print; it is concatenated to the prefix,
            so it must be a string.
    """
    # Year-month-day order (the original pattern had day and month swapped,
    # which produced ambiguous, non-sortable stamps).
    timestamp = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
    print(timestamp + ': ' + line)
# Script body: hash every file named in mp3_list.txt (one path per line) and
# write each group of files sharing an MD5 digest to duplicates.txt, one group
# per line with the names joined by ' OR '.
output_line("Starting program...")

# Files are hashed in fixed-size chunks so a large file is never loaded
# into memory in one piece.
CHUNK_SIZE = 1 << 20

counter = 0
hash_dict = {}  # hex digest -> list of file names that produced it

with open('mp3_list.txt', 'r') as list_file:
    for line in list_file:
        mp3_name = line.strip()
        if not mp3_name:
            # A blank line would otherwise become open(''), which fails.
            continue

        m = hashlib.md5()
        # Binary mode: the content is raw audio bytes, and text mode would
        # translate or try to decode them and change the digest.
        with open(mp3_name, 'rb') as mp3_file:
            for chunk in iter(lambda: mp3_file.read(CHUNK_SIZE), b''):
                m.update(chunk)
        digest = m.hexdigest()

        hash_dict.setdefault(digest, []).append(mp3_name)

        counter += 1
        if counter % 100 == 0:
            # Progress report every 100 files.
            output_line(str(counter) + " files inspected, " +
                        str(len(hash_dict)) + " hashes found...")

output_line("Writing duplicates.txt...")
with open('duplicates.txt', 'w') as dupe_file:
    for digest in hash_dict:
        names = hash_dict[digest]
        # Only digests shared by more than one file are duplicates.
        if len(names) > 1:
            dupe_file.write(' OR '.join(names) + "\n")

output_line("Finished.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment