Skip to content

Instantly share code, notes, and snippets.

@zimolzak
Created January 2, 2017 23:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zimolzak/9855164ac0dab6db38bc4794054d2e2f to your computer and use it in GitHub Desktop.
Save zimolzak/9855164ac0dab6db38bc4794054d2e2f to your computer and use it in GitHub Desktop.
Deduplicate MP3s and AACs
import hashlib
import os
from subprocess import getoutput
# Shell commands whose output (one path per line) enumerates each library.
# Raw strings keep the backslash that escapes the space for the shell; in a
# plain literal "\ " is an invalid escape sequence and triggers a warning.
# Library B: its files are reported when no identical copy exists in A.
command_b = r"find /Users/ajz/Music/iTunes/iTunes\ Media/Music"
#command_b = r"find /Users/ajz/Music/iTunes/iTunes\ Media/Music/Weezer"
# Library A: the reference set that B is compared against.
command_a = r"find /Users/ajz/powerbook/Users/ajz/Music/iTunes/iTunes\ Music"
# When True, the report also prints a record for each B file duplicated in A.
debug = False
########
def last_index(S, char):
    """Return the highest index i such that S[i] == char, or -1 if none.

    S may be any indexable sequence with a length (a string, a list, ...).
    Elements are compared one at a time, so a multi-character `char` never
    matches inside a string.
    """
    # Walk from the end so the first hit is the last occurrence; a reversed
    # range avoids building and reversing a throwaway list of indices.
    for i in range(len(S) - 1, -1, -1):
        if S[i] == char:
            return i
    return -1
def sha_path(pathname):
    """Return the SHA-1 digest of the file at `pathname` as 40 hex characters.

    The digest is computed in-process instead of shelling out to `shasum`
    (whose default algorithm is SHA-1), so the path never passes through a
    shell: names containing quotes, `$`, `;`, backticks or any other
    metacharacter are hashed correctly and cannot inject commands.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.sha1()
    with open(pathname, 'rb') as stream:
        # Read in 1 MiB chunks so large audio files are not loaded whole.
        for chunk in iter(lambda: stream.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()
def path2file(path):
    """Return the part of `path` after its last '/'.

    A path with no '/' is returned unchanged; a path ending in '/' yields
    the empty string.
    """
    # rpartition splits on the last '/', and returns ('', '', path) when
    # there is none, so element 2 is always the trailing component.
    return path.rpartition('/')[2]
def reference_lists(find_lines):
    """Select the music files from a list of paths and look up their sizes.

    Args:
        find_lines: iterable of path strings, one per entry.

    Returns:
        A two-element list `[pathnames, sizes]`: the paths ending in `.mp3`
        or `.m4a` (case-insensitive), in input order, and the size in bytes
        of each one at the same index.

    Raises:
        OSError: from `os.path.getsize` if a selected path cannot be read.
    """
    # Match on the extension rather than on a substring: a bare
    # "'mp3' in line" test also accepts things like "mp3 rips/cover.jpg".
    music_suffixes = ('.mp3', '.m4a')
    pathnames = [
        line for line in find_lines if line.lower().endswith(music_suffixes)
    ]
    sizes = [os.path.getsize(path) for path in pathnames]
    return [pathnames, sizes]
########
# Index library A by file size.  Size is a cheap pre-filter: two files can
# only be identical if their sizes match, so only size collisions get hashed.
find_lines_a = getoutput(command_a).splitlines()
[path_list_a, size_list_a] = reference_lists(find_lines_a)
# size -> every A path with that size, in listing order.  Keeping all of them
# (not just the first) lets a B file be matched against any same-size A file.
paths_a_by_size = {}
for path_a, size_a in zip(path_list_a, size_list_a):
    paths_a_by_size.setdefault(size_a, []).append(path_a)
######## Begin processing of B. Depends on this global var: paths_a_by_size.
MUSIC_SUFFIXES = ('.mp3', '.m4a')
NO_SHA = ' ' * 40  # placeholder stored when no hash was computed
find_lines_b = getoutput(command_b).splitlines()
# Parallel lists, one entry per music file found in B.
ub = []  # True when the B file has no byte-identical copy in A
size_list_b = []
path_list_b = []
sha_list_a = []
sha_list_b = []
path_matching_a = []  # same-size A path the B file was compared to, or None
sha_cache_a = {}  # A path -> hash, so each A file is hashed at most once
for line in find_lines_b:
    # Match on the extension; a substring test would also accept unrelated
    # files that merely have "mp3" or "m4a" somewhere in their path.
    if not line.lower().endswith(MUSIC_SUFFIXES):
        continue
    path_b = line
    size_b = os.path.getsize(path_b)
    sha_a = sha_b = NO_SHA
    matched_path_a = None
    unique = True
    candidates = paths_a_by_size.get(size_b, [])
    if candidates:
        sha_b = sha_path(path_b)
        for candidate in candidates:
            if candidate not in sha_cache_a:
                sha_cache_a[candidate] = sha_path(candidate)
            candidate_sha = sha_cache_a[candidate]
            # Remember the first same-size candidate for the report, but
            # prefer an exact hash match when one exists.
            if matched_path_a is None or candidate_sha == sha_b:
                matched_path_a = candidate
                sha_a = candidate_sha
            if candidate_sha == sha_b:
                unique = False
                break
    ub.append(unique)
    path_matching_a.append(matched_path_a)
    size_list_b.append(size_b)
    sha_list_a.append(sha_a)
    sha_list_b.append(sha_b)
    path_list_b.append(path_b)
######## print
for i, p in enumerate(path_list_b):
    if sha_list_a[i] == NO_SHA:  # Element of B is unique in size
        assert ub[i]
        print(p)
    elif ub[i]:  # same exact size, different hash
        print(p)
        print(' #### ' + path_matching_a[i])
    elif debug:  # duplicate of a file in A; shown only when debugging
        print([size_list_b[i], sha_list_a[i], sha_list_b[i], ub[i], path2file(p)])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment