Created
January 3, 2022 18:11
-
-
Save kyleziegler/2db6f74c06219867425570f2a8725844 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk

# Module-level tokenizer, created once and reused by
# shingling_jaccard_similarity for every call.
tokenizer = nltk.tokenize.casual.TweetTokenizer()
def jaccard_similarity(list_x, list_y):
    """Return the Jaccard similarity of two collections of hashable items.

    Both inputs are converted to sets, so duplicates and ordering are
    ignored. The score is the size of the intersection divided by the size
    of the union.

    Args:
        list_x: Iterable of hashable items.
        list_y: Iterable of hashable items.

    Returns:
        The similarity as a float between 0.0 and 1.0. Returns 0.0 when
        both inputs are empty.
    """
    # Convert to sets so only unique values are compared.
    set_x = set(list_x)
    set_y = set(list_y)
    union = set_x | set_y
    # An empty union means both inputs were empty; return 0.0 instead of
    # dividing by zero, keeping the return type a float on every path.
    if not union:
        return 0.0
    return len(set_x & set_y) / len(union)
def shingling_jaccard_similarity(text_x, text_y, n):
    """Return the Jaccard similarity of two texts over word-level n-grams.

    Each text is tokenized with the module-level ``tokenizer``, the tokens
    are grouped into n-grams of length ``n``, and the two n-gram
    collections are compared with ``jaccard_similarity``.

    Args:
        text_x: First text to compare.
        text_y: Second text to compare.
        n: Number of consecutive tokens in each n-gram (shingle).

    Returns:
        The value returned by ``jaccard_similarity`` for the two n-gram
        collections.
    """
    # Tokenize first: passing the raw strings to ngrams would build the
    # n-grams at the character level instead of the word level.
    x_tok = tokenizer.tokenize(text_x)
    y_tok = tokenizer.tokenize(text_y)
    # Hand the n-gram iterables straight to jaccard_similarity, which
    # converts its arguments to sets; intermediate list copies add nothing.
    return jaccard_similarity(nltk.ngrams(x_tok, n), nltk.ngrams(y_tok, n))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment