Skip to content

Instantly share code, notes, and snippets.

@TobiX
Last active August 29, 2015 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TobiX/552c01a93af93465c572 to your computer and use it in GitHub Desktop.
Save TobiX/552c01a93af93465c572 to your computer and use it in GitHub Desktop.
Music duplicate finder
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os, sys
import acoustid
import xdg.BaseDirectory
import apsw
def setwal(db):
    """Switch a database connection to write-ahead logging.

    Runs ``pragma journal_mode=wal`` through a fresh cursor on *db* and
    then calls ``db.wal_autocheckpoint(10)``.

    db -- the connection object to configure; it must provide
          ``cursor()`` and ``wal_autocheckpoint()``.
    """
    pragma_cursor = db.cursor()
    pragma_cursor.execute("pragma journal_mode=wal")
    db.wal_autocheckpoint(10)
# Register the WAL hook before the connection below is created.
apsw.connection_hooks.append(setwal)

# The fingerprint cache lives in the per-user XDG cache directory.
cache_dir = xdg.BaseDirectory.save_cache_path('music-dup-finder')
cache = cache_dir + "/cache.sqlite"
db = apsw.Connection(cache)
cursor = db.cursor()

# Create the schema only when the 'files' table does not exist yet.
row = next(cursor.execute("SELECT count(*) FROM sqlite_master WHERE type='table' AND name='files'"))
if not row[0]:
    for ddl in (
        "CREATE TABLE files (id INTEGER PRIMARY KEY, name TEXT NOT NULL, fingerprint TEXT NOT NULL)",
        "CREATE UNIQUE INDEX files_name ON files (name)",
        "CREATE INDEX files_fingerprint ON files (fingerprint)",
    ):
        cursor.execute(ddl)
# New Fingerprints
for (path, _, files) in os.walk(unicode(sys.argv[1])):
p = os.path.abspath(path)
for f in files:
name = os.path.join(p, f)
if os.path.splitext(f)[1].lower() not in (u'.mp3', u'.ogg', u'.wav', u'.wma', u'.flac', u'.m4a', u'.mp4', u'.mp2', u'.mpg'):
continue
exist = cursor.execute("SELECT count(*) FROM files WHERE name = ?", (name,)).next()
if exist[0] == 1:
continue
try:
sys.stdout.write(u"Fingerprinting »%s«... " % (f,))
sys.stdout.flush()
_, fingerprint = acoustid.fingerprint_file(name.encode('utf-8'))
cursor.execute("INSERT INTO files (name, fingerprint) VALUES (?,?)", (name, fingerprint))
except acoustid.FingerprintGenerationError as e:
print u"ERR: %s" % (e,)
else:
print u"OK"
# Remove old fingerprints
#
# Drop cache rows whose file is no longer present on disk.
cursor2 = db.cursor()
# Collect the stale ids first.  Deleting from 'files' while a SELECT over the
# same table is still being stepped on this connection is undefined in SQLite
# (rows may be skipped or revisited), so the scan is finished before any
# DELETE is issued.
stale_ids = [
    row_id
    for (cached_name, row_id) in cursor.execute("SELECT name, id FROM files")
    if not os.access(cached_name, os.F_OK)
]
for row_id in stale_ids:
    cursor2.execute("DELETE FROM files WHERE id = ?", (row_id,))
# Find duplicates
for dup in cursor.execute("SELECT count(*) AS no, fingerprint FROM files GROUP BY fingerprint HAVING no > 1"):
print "Possible duplicate:"
for f in cursor2.execute("SELECT name FROM files WHERE fingerprint = ?", (dup[1],)):
print u"\t%s" % f
db.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment