Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
N-Gram Similarity Comparison
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# Get Tuple algorithms
import re
import math
import numpy as np
from itertools import chain
from collections import Counter
import nltk
from nltk.util import ngrams # This is the ngram magic.
from textblob import TextBlob
# Number of words per n-gram produced by the get_tuples_* functions.
NGRAM = 4

# Naive sentence boundary: a period or a newline.
re_sent_ends_naive = re.compile(r'[.\n]')
# A run of one or more characters that are not ASCII letters.
re_stripper_alpha = re.compile(r'[^a-zA-Z]+')
# Any single character that is neither an ASCII letter nor a naive sentence
# boundary.  Written as a raw string: in a normal string literal '\.' is an
# invalid escape sequence that newer Python versions warn about.
re_stripper_naive = re.compile(r'[^a-zA-Z\.\n]')


def splitter_naive(x):
    """Split text into naive sentence fragments.

    Every character other than an ASCII letter, a period or a newline is
    first replaced by a space; the result is then split on periods and
    newlines.  Fragments may be empty or carry surrounding whitespace.
    """
    return re_sent_ends_naive.split(re_stripper_naive.sub(' ', x))
# Sentence tokenizer loaded once at import time from NLTK's English Punkt
# resource; used by get_tuples_nltk_punkt_sentences.
# NOTE(review): presumably requires the NLTK 'punkt' data to be installed
# locally, and newer NLTK releases may no longer provide this pickle -- confirm.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
def get_tuples_nosentences(txt):
    """Build word n-grams over the whole text, ignoring sentence boundaries.

    Every run of non-letter characters is treated as one word separator, so
    an n-gram may span what a reader would consider two sentences.

    Returns a list of NGRAM-word tuples, or None when txt is empty/falsy.
    """
    if not txt:
        return None
    words = re_stripper_alpha.sub(' ', txt).split()
    return list(ngrams(words, NGRAM))
def get_tuples_manual_sentences(txt):
    """Build word n-grams per naive sentence (split on periods and newlines).

    N-grams never cross a fragment boundary; fragments with fewer than NGRAM
    words contribute nothing.

    Returns a list of NGRAM-word tuples, or None when txt is empty/falsy.
    """
    if not txt:
        return None
    grams = []
    for fragment in splitter_naive(txt):
        if not fragment:
            # Empty pieces appear between adjacent boundaries.
            continue
        words = fragment.split()
        if len(words) >= NGRAM:
            grams.extend(ngrams(words, NGRAM))
    return grams
def get_tuples_nltk_punkt_sentences(txt):
    """Build word n-grams per sentence, using the module's Punkt tokenizer.

    Sentences come from sent_detector.tokenize(); each sentence is split into
    words on runs of non-letter characters.  N-grams never cross a sentence
    boundary, and sentences with fewer than NGRAM words contribute nothing.

    Returns a list of NGRAM-word tuples, or None when txt is empty/falsy.
    """
    if not txt:
        return None
    grams = []
    for sentence in sent_detector.tokenize(txt):
        # Splitting on punctuation leaves empty strings (e.g. after a trailing
        # '.'); drop them *before* the length check so it counts real words.
        words = [word for word in re_stripper_alpha.split(sentence) if word]
        if len(words) >= NGRAM:
            grams.extend(ngrams(words, NGRAM))
    return grams
def get_tuples_textblob_sentences(txt):
    """Build word n-grams per sentence, using TextBlob for segmentation.

    TextBlob supplies both the sentences and each sentence's word list.
    N-grams never cross a sentence boundary.

    Returns a list of NGRAM-word tuples, or None when txt is empty/falsy.
    """
    if not txt:
        return None
    grams = []
    for sentence in TextBlob(txt).sentences:
        words = sentence.words
        # '>=' (not '>'): a sentence of exactly NGRAM words still holds one
        # n-gram, matching the other get_tuples_* functions.
        if len(words) >= NGRAM:
            grams.extend(ngrams(words, NGRAM))
    return grams
def jaccard_distance(a, b):
    """Return the Jaccard similarity (index) of the items in a and b.

    Despite the function's name, the value is the similarity
    |A & B| / |A | B|, not the distance (1 - similarity): 1.0 means both
    inputs hold the same distinct items, 0.0 means they share none.
    Repeated items are ignored because both inputs are reduced to sets.

    Args:
        a: Iterable of hashable items (e.g. n-gram tuples), or None.
        b: Iterable of hashable items (e.g. n-gram tuples), or None.
            None is treated as empty, since the get_tuples_* functions
            return None for empty text.

    Returns:
        A float in [0.0, 1.0].  When both inputs are empty the result is
        0.0, mirroring cosine_similarity_ngrams' zero-denominator result
        instead of raising ZeroDivisionError.
    """
    set_a = set(a) if a is not None else set()
    set_b = set(b) if b is not None else set()
    union = set_a | set_b
    if not union:
        return 0.0
    return float(len(set_a & set_b)) / len(union)
def cosine_similarity_ngrams(a, b):
    """Return the cosine similarity between two collections of n-grams.

    Each collection is turned into a frequency vector (item -> count), so
    unlike a set-based measure, repeated items carry more weight.

    Args:
        a: Iterable of hashable items (e.g. n-gram tuples).
        b: Iterable of hashable items (e.g. n-gram tuples).

    Returns:
        dot(a, b) / (|a| * |b|) as a float, or 0.0 when either vector is
        empty (zero denominator).
    """
    vec1 = Counter(a)
    vec2 = Counter(b)

    # Only items present in both vectors contribute to the dot product.
    shared = set(vec1) & set(vec2)
    numerator = sum(vec1[item] * vec2[item] for item in shared)

    sum1 = sum(count ** 2 for count in vec1.values())
    sum2 = sum(count ** 2 for count in vec2.values())
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    return float(numerator) / denominator
def test():
    """Print a small demonstration of the n-gram extractors and measures.

    First prints how many n-grams each get_tuples_* function extracts from
    the same paragraph, then prints the Jaccard and cosine scores for three
    sentence pairs (n-grams taken with get_tuples_nosentences).
    """
    paragraph = """It was the best of times, it was the worst of times.
It was the age of wisdom? It was the age of foolishness!
I first met Dr. Frankenstein in Munich; his monster was, presumably, at home."""
    print(paragraph)

    extractors = (
        ("Number of N-grams (no sentences):", get_tuples_nosentences),
        ("Number of N-grams (naive sentences):", get_tuples_manual_sentences),
        ("Number of N-grams (nltk sentences):", get_tuples_nltk_punkt_sentences),
        ("Number of N-grams (TextBlob sentences):", get_tuples_textblob_sentences),
    )
    for label, extract in extractors:
        print(label, len(extract(paragraph)))

    sentence_pairs = (
        ("It was the best of times.",
         "It was the worst of times."),
        ("Above is a bad example of four-gram similarity.",
         "This is a better example of four-gram similarity."),
        ("Jaccard Index ignores repetition repetition repetition repetition repetition.",
         "Cosine similarity weighs repetition repetition repetition repetition repetition."),
    )
    for first, second in sentence_pairs:
        a = get_tuples_nosentences(first)
        b = get_tuples_nosentences(second)
        print("Jaccard: {} Cosine: {}".format(
            jaccard_distance(a, b), cosine_similarity_ngrams(a, b)))
# Run the demonstration unconditionally; there is no __main__ guard, so this
# also executes whenever the module is imported.
test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.