Skip to content

Instantly share code, notes, and snippets.

@gaulinmp
Last active April 4, 2024 04:33
N-Gram Similarity Comparison
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# Get Tuple algorithms
import re
import math
import numpy as np
from itertools import chain
from collections import Counter
import nltk
from nltk.util import ngrams # This is the ngram magic.
from textblob import TextBlob
NGRAM = 4
re_sent_ends_naive = re.compile(r'[.\n]')
re_stripper_alpha = re.compile('[^a-zA-Z]+')
re_stripper_naive = re.compile('[^a-zA-Z\.\n]')
splitter_naive = lambda x: re_sent_ends_naive.split(re_stripper_naive.sub(' ', x))
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
def get_tuples_nosentences(txt):
"""Get tuples that ignores all punctuation (including sentences)."""
if not txt: return None
ng = ngrams(re_stripper_alpha.sub(' ', txt).split(), NGRAM)
return list(ng)
def get_tuples_manual_sentences(txt):
"""Naive get tuples that uses periods or newlines to denote sentences."""
if not txt: return None
sentences = (x.split() for x in splitter_naive(txt) if x)
ng = (ngrams(x, NGRAM) for x in sentences if len(x) >= NGRAM)
return list(chain(*ng))
def get_tuples_nltk_punkt_sentences(txt):
"""Get tuples that doesn't use textblob."""
if not txt: return None
sentences = (re_stripper_alpha.split(x) for x in sent_detector.tokenize(txt) if x)
# Need to filter X because of empty 'words' from punctuation split
ng = (ngrams(filter(None, x), NGRAM) for x in sentences if len(x) >= NGRAM)
return list(chain(*ng))
def get_tuples_textblob_sentences(txt):
"""New get_tuples that does use textblob."""
if not txt: return None
tb = TextBlob(txt)
ng = (ngrams(x.words, NGRAM) for x in tb.sentences if len(x.words) > NGRAM)
return [item for sublist in ng for item in sublist]
def jaccard_distance(a, b):
"""Calculate the jaccard distance between sets A and B"""
a = set(a)
b = set(b)
return 1.0 * len(a&b)/len(a|b)
def cosine_similarity_ngrams(a, b):
vec1 = Counter(a)
vec2 = Counter(b)
intersection = set(vec1.keys()) & set(vec2.keys())
numerator = sum([vec1[x] * vec2[x] for x in intersection])
sum1 = sum([vec1[x]**2 for x in vec1.keys()])
sum2 = sum([vec2[x]**2 for x in vec2.keys()])
denominator = math.sqrt(sum1) * math.sqrt(sum2)
if not denominator:
return 0.0
return float(numerator) / denominator
def test():
paragraph = """It was the best of times, it was the worst of times.
It was the age of wisdom? It was the age of foolishness!
I first met Dr. Frankenstein in Munich; his monster was, presumably, at home."""
print(paragraph)
_ = get_tuples_nosentences(paragraph);print("Number of N-grams (no sentences):", len(_));_
_ = get_tuples_manual_sentences(paragraph);print("Number of N-grams (naive sentences):", len(_));_
_ = get_tuples_nltk_punkt_sentences(paragraph);print("Number of N-grams (nltk sentences):", len(_));_
_ = get_tuples_textblob_sentences(paragraph);print("Number of N-grams (TextBlob sentences):", len(_));_
a = get_tuples_nosentences("It was the best of times.")
b = get_tuples_nosentences("It was the worst of times.")
print("Jaccard: {} Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))
a = get_tuples_nosentences("Above is a bad example of four-gram similarity.")
b = get_tuples_nosentences("This is a better example of four-gram similarity.")
print("Jaccard: {} Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))
a = get_tuples_nosentences("Jaccard Index ignores repetition repetition repetition repetition repetition.")
b = get_tuples_nosentences("Cosine similarity weighs repetition repetition repetition repetition repetition.")
print("Jaccard: {} Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))
test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment