Created
March 17, 2018 17:34
-
-
Save SandyRogers/a73fab3ffa96a8c2feb93f1a1a6069bd to your computer and use it in GitHub Desktop.
Custom similarity model for spaCy, taking maximum token similarity between two documents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools

import numpy as np
class SimilarityModel(object):
    """Install custom ``similarity`` hooks on a spaCy ``Doc``.

    Token similarity is the raw dot product of the two token vectors.
    Doc/span similarity is computed by scoring every (token, token) pair
    across the two objects and reducing the scores with an accumulator
    (``np.max`` by default, i.e. the best-matching token pair wins).

    Instances are callable on a doc and return that same doc, so they can
    be applied directly (``sim(doc)``) or used as a pipeline component.
    """

    def __init__(self, token_similarity_accumulator=np.max):
        """Store the reducer applied to the list of pairwise similarities.

        Args:
            token_similarity_accumulator: Callable that takes a list of
                pairwise token similarities and returns one value
                (e.g. ``np.max``, ``np.mean``).
        """
        self._accumulator = token_similarity_accumulator

    def __call__(self, doc):
        """Register the similarity hooks on ``doc`` and return it.

        Args:
            doc: Object exposing ``user_hooks``, ``user_span_hooks`` and
                ``user_token_hooks`` mappings (a spaCy ``Doc``).

        Returns:
            The same ``doc``, mutated in place.
        """
        doc.user_hooks['similarity'] = self.doc_span_similarity
        doc.user_span_hooks['similarity'] = self.doc_span_similarity
        doc.user_token_hooks['similarity'] = self.token_similarity
        # Returning the doc lets the instance be chained or used as a
        # pipeline component; callers that ignore the result are unaffected.
        return doc

    def token_similarity(self, tok1, tok2):
        """Return the dot product of the two tokens' ``vector`` attributes.

        NOTE: this is an unnormalised score; usually you would divide by
        the vector norms here too (cosine similarity).
        """
        return np.dot(tok1.vector, tok2.vector)

    def doc_span_similarity(self, obj1, obj2):
        """Reduce all pairwise token similarities between two iterables.

        Args:
            obj1: Iterable of tokens (doc or span).
            obj2: Iterable of tokens (doc or span).

        Returns:
            The accumulator applied to the list of pairwise similarities,
            or ``0.0`` when either side has no tokens.
        """
        token_pairs = itertools.product(obj1, obj2)
        similarities = [
            self.token_similarity(tok1, tok2) for tok1, tok2 in token_pairs
        ]
        if not similarities:
            # No token pairs to compare; reducers such as np.max raise
            # ValueError on an empty sequence, so report "no similarity".
            return 0.0
        return self._accumulator(similarities)
# To apply it.
# NOTE(review): `nlp` is not defined in this file; it is assumed to be an
# already-loaded spaCy pipeline with word vectors - confirm before running.
a = nlp('hello world')
b = nlp('greetings planet')

# Install the custom similarity hooks on both docs.
sim = SimilarityModel()
sim(a)
sim(b)

# Doc.similarity now dispatches to SimilarityModel.doc_span_similarity;
# print the score so it is visible when run as a script, not only in a REPL.
print(a.similarity(b))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment