Skip to content

Instantly share code, notes, and snippets.

@AaronJackson
Last active October 2, 2022 23:07
Show Gist options
  • Save AaronJackson/918b8d6afcfdcd689f294e5ab2c1ad96 to your computer and use it in GitHub Desktop.
Save AaronJackson/918b8d6afcfdcd689f294e5ab2c1ad96 to your computer and use it in GitHub Desktop.
from rank_bm25 import BM25Okapi
import glob, os
import numpy as np
import cv2
import string
# Gather every .txt file in the current working directory and build one
# cleaned-up text string per file. `similarity` is only initialised here; it
# is filled in once the documents have been indexed.
files = list(glob.glob("*.txt"))
corpus = []
similarity = []

for path in files:
    with open(path, 'r') as handle:
        text = handle.read()
    # Flatten to a single upper-case line so that splitting on spaces and
    # comparing words is not affected by line breaks or letter case.
    text = text.replace('\n', ' ').upper()
    # Delete all ASCII punctuation and digits in one pass.
    text = text.translate(str.maketrans('', '', string.punctuation + string.digits))
    # Remove these substrings wherever they occur (also inside longer words),
    # in this exact order. Presumably they are boilerplate words shared by
    # many documents -- TODO confirm with the author.
    for noise in ('PAT', 'RCA', 'DECCA', 'VICTOR', 'COLUMBIA'):
        text = text.replace(noise, '')
    corpus.append(text)
# Index tokens: split each cleaned document on single spaces and keep only
# words longer than three characters (this also drops the empty strings that
# runs of consecutive spaces produce).
tokens = [[word for word in doc.split(" ") if len(word) > 3] for doc in corpus]
bm25 = BM25Okapi(tokens)

# Use every document as a query against the whole index. The query keeps all
# non-empty words, including the short ones that were left out of the index.
# Row i of the resulting matrix holds the scores of document i's words
# against every indexed document.
similarity.extend(
    bm25.get_scores([word for word in doc.split(" ") if word])
    for doc in corpus
)

# Keep only the strictly lower triangle: the diagonal (a document scored
# against itself) and everything above it are zeroed, so each pair of
# documents is represented by a single cell.
sim = np.tril(np.array(similarity), k=-1)
import csv

# Dump the similarity matrix to pysimilarity.csv, one matrix row per CSV row.
# The file is opened with newline='' as the csv module requires: the writer
# emits its own "\r\n" line terminators, and without newline='' those are
# translated a second time on Windows, leaving a blank line after every row.
with open('pysimilarity.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(sim)
# possible duplicates
# Report every document pair whose score exceeds 60 and, for each one, save
# the two corresponding images side by side under combined/.

# cv2.imwrite does not create missing directories, so make sure the output
# directory exists before anything is written into it.
os.makedirs('combined', exist_ok=True)

rows, cols = np.where(sim > 60)
for r, c in zip(rows, cols):
    # A large difference in the number of indexed words means the two
    # documents are unlikely to be the same item.
    if abs(len(tokens[r]) - len(tokens[c])) > 100:
        continue  # probably info sheet
    score = sim[r, c]
    print(str(score) + " " + files[r] + " may match " + files[c])

    # Dropping the last four characters removes the ".txt" suffix; this
    # presumes every text file is named after an image stored next to it
    # (e.g. "scan.jpg.txt" -> "scan.jpg") -- TODO confirm.
    a = cv2.imread(files[r][:-4])
    b = cv2.imread(files[c][:-4])
    if a is None or b is None:
        # cv2.imread signals an unreadable or missing file by returning None
        # instead of raising; the match has already been reported above, so
        # only the picture is skipped.
        print("  could not read image for " + files[r] + " or " + files[c] + ", skipping picture")
        continue

    # Bring both images to the same size so they can be joined horizontally.
    a2 = cv2.resize(a, dsize=(768, 768))
    b2 = cv2.resize(b, dsize=(768, 768))
    combined = np.concatenate((a2, b2), axis=1)
    cv2.imwrite('combined/' + str(score) + "_" + str(r) + '_' + str(c) + '.jpg', combined)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment