Skip to content

Instantly share code, notes, and snippets.

@kyleziegler
Created January 3, 2022 18:11
Show Gist options
  • Save kyleziegler/2db6f74c06219867425570f2a8725844 to your computer and use it in GitHub Desktop.
Save kyleziegler/2db6f74c06219867425570f2a8725844 to your computer and use it in GitHub Desktop.
# Module-level tokenizer instance, created once and reused by
# shingling_jaccard_similarity to split input text into tokens.
# NOTE(review): `nltk` is referenced here but no import is visible in this
# snippet -- confirm `import nltk` exists at the top of the real file.
tokenizer = nltk.tokenize.casual.TweetTokenizer()
def jaccard_similarity(list_x, list_y):
    """Return the Jaccard similarity of two collections.

    Both inputs are reduced to sets first, so duplicate items and item
    order have no effect on the result.

    Args:
        list_x: Iterable of hashable items.
        list_y: Iterable of hashable items.

    Returns:
        ``len(X & Y) / len(X | Y)`` as a float between 0.0 and 1.0.
        When both inputs are empty the union is empty too, and 0.0 is
        returned instead of dividing by zero.
    """
    # Convert to sets, keeping only unique values.
    set_x = set(list_x)
    set_y = set(list_y)
    union = set_x | set_y
    # An empty union means both inputs were empty; return a float zero so
    # the return type is the same on every path.
    if not union:
        return 0.0
    return len(set_x & set_y) / len(union)
def shingling_jaccard_similarity(text_x, text_y, n):
    """Return the Jaccard similarity of two texts compared as token n-grams.

    Each text is tokenized with the module-level ``tokenizer``, the tokens
    are grouped into n-grams ("shingles") of length ``n``, and the two
    shingle collections are scored with ``jaccard_similarity``.

    Args:
        text_x: First text to compare.
        text_y: Second text to compare.
        n: Number of consecutive tokens in each shingle.

    Returns:
        The value returned by ``jaccard_similarity`` for the two shingle
        collections. NOTE(review): a text with fewer than ``n`` tokens
        presumably produces no shingles, which would make the score 0 even
        for identical short texts -- confirm against nltk.ngrams behavior.
    """
    # Tokenize first: passing the raw strings straight to nltk.ngrams would
    # build n-grams at the character level instead of the token level.
    x_tok = tokenizer.tokenize(text_x)
    y_tok = tokenizer.tokenize(text_y)
    # Make n-grams of the requested length. jaccard_similarity converts its
    # arguments to sets itself, so the n-gram iterables are handed over
    # directly rather than being copied into intermediate lists.
    x_shingles = nltk.ngrams(x_tok, n)
    y_shingles = nltk.ngrams(y_tok, n)
    return jaccard_similarity(x_shingles, y_shingles)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment