Skip to content

Instantly share code, notes, and snippets.

@SandyRogers
Created March 17, 2018 17:34
Show Gist options
  • Save SandyRogers/a73fab3ffa96a8c2feb93f1a1a6069bd to your computer and use it in GitHub Desktop.
Save SandyRogers/a73fab3ffa96a8c2feb93f1a1a6069bd to your computer and use it in GitHub Desktop.
Custom similarity model for spaCy, taking maximum token similarity between two documents
import itertools

import numpy as np
class SimilarityModel(object):
    """Install token-pair based similarity hooks on a document.

    Calling an instance on a document registers hooks under the
    ``'similarity'`` key of the document's ``user_hooks``,
    ``user_span_hooks`` and ``user_token_hooks`` mappings. Document/span
    similarity is computed by scoring every pair of tokens (one from each
    side) and reducing the scores with a configurable accumulator.
    """

    def __init__(self, token_similarity_accumulator=np.max):
        """Store the reducer applied to the list of pairwise token scores.

        Args:
            token_similarity_accumulator: Callable taking a list of
                numeric scores and returning a single value. Defaults to
                ``np.max`` (best-matching token pair wins).
        """
        self._accumulator = token_similarity_accumulator

    def __call__(self, doc):
        """Register the similarity hooks on *doc* and return it.

        Returning the document lets the instance be chained or used where
        a callable that passes the document through is expected; callers
        that ignore the return value are unaffected.
        """
        doc.user_hooks['similarity'] = self.doc_span_similarity
        doc.user_span_hooks['similarity'] = self.doc_span_similarity
        doc.user_token_hooks['similarity'] = self.token_similarity
        return doc

    def token_similarity(self, tok1, tok2):
        """Return the dot product of the two tokens' ``vector`` attributes."""
        # NOTE: this is an unnormalised dot product; usually you would
        # divide by the vector norms here too (cosine similarity).
        return np.dot(tok1.vector, tok2.vector)

    def doc_span_similarity(self, obj1, obj2):
        """Reduce the similarities of all token pairs drawn from two iterables.

        Args:
            obj1: Iterable of tokens (each exposing ``vector``).
            obj2: Iterable of tokens (each exposing ``vector``).

        Returns:
            The accumulator applied to every pairwise token score, or
            ``0.0`` when either side has no tokens (there are no pairs to
            score, and reducers such as ``np.max`` raise on empty input).
        """
        token_pairs = itertools.product(obj1, obj2)
        similarities = [
            self.token_similarity(tok1, tok2) for tok1, tok2 in token_pairs
        ]
        if not similarities:
            return 0.0
        return self._accumulator(similarities)
# Example usage.
# NOTE(review): `nlp` is not defined in this snippet — presumably a loaded
# spaCy pipeline created elsewhere; confirm before running.
a, b = nlp('hello world'), nlp('greetings planet')

# Install the custom similarity hooks on both documents, then compare them.
sim = SimilarityModel()
sim(a)
sim(b)
a.similarity(b)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment