Skip to content

Instantly share code, notes, and snippets.

@iwalton3
Last active May 20, 2020 22:03
Show Gist options
  • Save iwalton3/e4e15d2315999308e54a9f793405ce18 to your computer and use it in GitHub Desktop.
Save iwalton3/e4e15d2315999308e54a9f793405ce18 to your computer and use it in GitHub Desktop.
Chromaprint Duplicate Finder (License: MIT)
#!/usr/bin/env python3
import numpy as np
import numba
import json
@numba.jit(nopython=True)
def dist(listx: numba.types.uint32[:], listy: numba.types.uint32[:]):
    """Score how different two fingerprints are (lower means more alike).

    Only the overlapping prefix of the two arrays is compared. At each
    position the two values are XOR-ed, the set bits in the low 32 bits of
    the result are counted, and that count is cubed. The cubes are averaged
    over the compared length and the average is divided by 32.

    Returns 100 when fewer than 50 positions overlap.
    """
    overlap = min(len(listx), len(listy))
    if overlap < 50:
        return 100
    # Per-byte bit-count table: bits_set[b] is the number of 1 bits in b.
    bits_set = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8]
    total = 0
    for pos in range(overlap):
        diff = listx[pos] ^ listy[pos]
        # Count differing bits one byte at a time, lowest byte first.
        byte0 = bits_set[diff & 0xff]
        byte1 = bits_set[(diff >> 8) & 0xff]
        byte2 = bits_set[(diff >> 16) & 0xff]
        byte3 = bits_set[(diff >> 24) & 0xff]
        total += (byte0 + byte1 + byte2 + byte3)**3
    mean_cubed = total / overlap
    return mean_cubed/32
# Keep the first 32 characters of every manifest line (presumably a hex UUID;
# verify against whatever writes the manifest). The `with` block makes sure
# the manifest file handle is closed once the list has been built.
with open("Manifest-UUID-List") as manifest:
    uuids = [line[0:32] for line in manifest]
len_uuids = len(uuids)
def get_pf(filename):
    """Load one fingerprint file.

    The file must contain exactly two lines, each of the form ``key=value``:
    first the duration (an integer), then the fingerprint (comma-separated
    integers that fit in an unsigned 32-bit value). Presumably this is the
    output of Chromaprint's ``fpcalc -raw`` -- confirm against whatever
    produces the files.

    Args:
        filename: path of the fingerprint file to read.

    Returns:
        ``(fingerprint, duration)`` where ``fingerprint`` is a ``np.uint32``
        array and ``duration`` is an ``int``, or ``(None, None)`` when the
        file cannot be read or does not have the expected layout.
    """
    try:
        with open(filename, 'rb') as fh:
            # Unpacking raises ValueError unless there are exactly two lines.
            r_duration, r_fp = fh.readlines()
        fingerprint = np.array(
            [int(x) for x in r_fp.split(b'=')[1].split(b',')],
            dtype=np.uint32,
        )
        duration = int(r_duration.split(b'=')[1])
    except (OSError, ValueError, IndexError, OverflowError):
        # Only unreadable or malformed files are treated as "no fingerprint";
        # KeyboardInterrupt / SystemExit are deliberately allowed to propagate.
        return (None, None)
    return (fingerprint, duration)
# Two tracks count as duplicates when their durations differ by less than
# MAX_DURATION_DELTA and dist() of their fingerprints is below
# MAX_FINGERPRINT_DIST.
MAX_DURATION_DELTA = 10
MAX_FINGERPRINT_DIST = 2


def _find_root(parent, node):
    """Return the representative of node's group, flattening the path to it."""
    root = node
    while parent[root] != root:
        root = parent[root]
    # Point every entry on the walked path straight at the root.
    while parent[node] != root:
        parent[node], node = root, parent[node]
    return root


def _union(parent, first, second):
    """Put first and second into the same group, registering them if new."""
    parent.setdefault(first, first)
    parent.setdefault(second, second)
    root_first = _find_root(parent, first)
    root_second = _find_root(parent, second)
    if root_first != root_second:
        parent[root_second] = root_first


item_list = [get_pf("fingerprints/" + x) for x in uuids]

# Disjoint-set forest keyed by uuid. Only uuids that took part in at least one
# match are ever inserted. Merging through the forest keeps groups transitively
# closed regardless of the order in which matching pairs are discovered, so no
# separate clean-up pass over partially merged sets is needed.
parent = {}
for i in range(len_uuids):
    f1, d1 = item_list[i]
    if f1 is None:
        continue
    print("{}/{} {}%".format(i, len_uuids, i*100/len_uuids))
    for j in range(i+1, len_uuids):
        f2, d2 = item_list[j]
        if f2 is None:
            continue
        # The cheap duration check runs first so dist() is only computed for
        # plausible candidates.
        if (abs(d1-d2) < MAX_DURATION_DELTA
                and dist(f1, f2) < MAX_FINGERPRINT_DIST):
            _union(parent, uuids[i], uuids[j])

# Collect every uuid under its group's representative.
groups_by_root = {}
for uuid in parent:
    groups_by_root.setdefault(_find_root(parent, uuid), set()).add(uuid)

# dup_sets: one set per duplicate group; dup_dict: uuid -> the set it is in.
dup_sets = list(groups_by_root.values())
dup_dict = {uuid: group for group in dup_sets for uuid in group}

with open("dups", "w") as fh:
    # Members are sorted so the output file is the same from run to run.
    json.dump([sorted(x) for x in dup_sets], fh)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment