# N-Gram Similarity Comparison

Four ways to extract word n-gram tuples from text (ignoring sentence boundaries, naive splitting on periods/newlines, NLTK's Punkt tokenizer, and TextBlob), plus Jaccard and cosine similarity computed over the resulting n-grams.
```python
# N-gram extraction and similarity measures
import re
import math
from itertools import chain
from collections import Counter

import nltk
from nltk.util import ngrams  # This is the ngram magic.
from textblob import TextBlob

NGRAM = 4

re_sent_ends_naive = re.compile(r'[.\n]')
re_stripper_alpha = re.compile(r'[^a-zA-Z]+')
re_stripper_naive = re.compile(r'[^a-zA-Z.\n]')

splitter_naive = lambda x: re_sent_ends_naive.split(re_stripper_naive.sub(' ', x))

# Pre-trained Punkt sentence tokenizer (requires the 'punkt' NLTK data; see below).
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

def get_tuples_nosentences(txt):
    """Get n-gram tuples, ignoring all punctuation (including sentence boundaries)."""
    if not txt: return None
    ng = ngrams(re_stripper_alpha.sub(' ', txt).split(), NGRAM)
    return list(ng)

def get_tuples_manual_sentences(txt):
    """Naive version: treat periods and newlines as sentence boundaries."""
    if not txt: return None
    sentences = (x.split() for x in splitter_naive(txt) if x)
    ng = (ngrams(x, NGRAM) for x in sentences if len(x) >= NGRAM)
    return list(chain(*ng))

def get_tuples_nltk_punkt_sentences(txt):
    """Split sentences with NLTK's Punkt tokenizer."""
    if not txt: return None
    sentences = (re_stripper_alpha.split(x) for x in sent_detector.tokenize(txt) if x)
    # Filter x because splitting on punctuation leaves empty 'words'.
    ng = (ngrams(filter(None, x), NGRAM) for x in sentences if len(x) >= NGRAM)
    return list(chain(*ng))

def get_tuples_textblob_sentences(txt):
    """Split sentences (and words) with TextBlob."""
    if not txt: return None
    tb = TextBlob(txt)
    ng = (ngrams(x.words, NGRAM) for x in tb.sentences if len(x.words) >= NGRAM)
    return [item for sublist in ng for item in sublist]

def jaccard_similarity(a, b):
    """Jaccard similarity |A & B| / |A | B| between the n-gram sets A and B."""
    a = set(a)
    b = set(b)
    return len(a & b) / len(a | b)

def cosine_similarity_ngrams(a, b):
    """Cosine similarity between the n-gram count vectors of A and B."""
    vec1 = Counter(a)
    vec2 = Counter(b)

    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)

    sum1 = sum(v ** 2 for v in vec1.values())
    sum2 = sum(v ** 2 for v in vec2.values())
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    return numerator / denominator
```
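`nltk.data.load('tokenizers/punkt/english.pickle')` assumes the Punkt model is already installed; if it isn't, a minimal one-time setup looks like this (TextBlob's corpora can likewise be fetched with `python -m textblob.download_corpora`):

```python
import nltk
nltk.download('punkt')  # fetch the Punkt sentence-tokenizer data used above
```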
# Test N-Gram functions
```python
paragraph = """It was the best of times, it was the worst of times.
It was the age of wisdom? It was the age of foolishness!
I first met Dr. Frankenstein in Munich; his monster was, presumably, at home."""
```
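The test paragraph deliberately mixes `?`, `!`, `;`, and the abbreviation `Dr.`, which is exactly what separates the naive splitter from the real sentence tokenizers below.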
```python
_ = get_tuples_nosentences(paragraph); print("Number of N-grams:", len(_)); _
```

```
Number of N-grams: 34
[('It', 'was', 'the', 'best'),
 ('was', 'the', 'best', 'of'),
 ('the', 'best', 'of', 'times'),
 ('best', 'of', 'times', 'it'),
 ('of', 'times', 'it', 'was'),
 ('times', 'it', 'was', 'the'),
 ('it', 'was', 'the', 'worst'),
 ('was', 'the', 'worst', 'of'),
 ('the', 'worst', 'of', 'times'),
 ('worst', 'of', 'times', 'It'),
 ('of', 'times', 'It', 'was'),
 ('times', 'It', 'was', 'the'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'wisdom'),
 ('age', 'of', 'wisdom', 'It'),
 ('of', 'wisdom', 'It', 'was'),
 ('wisdom', 'It', 'was', 'the'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'foolishness'),
 ('age', 'of', 'foolishness', 'I'),
 ('of', 'foolishness', 'I', 'first'),
 ('foolishness', 'I', 'first', 'met'),
 ('I', 'first', 'met', 'Dr'),
 ('first', 'met', 'Dr', 'Frankenstein'),
 ('met', 'Dr', 'Frankenstein', 'in'),
 ('Dr', 'Frankenstein', 'in', 'Munich'),
 ('Frankenstein', 'in', 'Munich', 'his'),
 ('in', 'Munich', 'his', 'monster'),
 ('Munich', 'his', 'monster', 'was'),
 ('his', 'monster', 'was', 'presumably'),
 ('monster', 'was', 'presumably', 'at'),
 ('was', 'presumably', 'at', 'home')]
```
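Note the boundary-crossing n-grams such as `('worst', 'of', 'times', 'It')`: with no sentence splitting, 4-grams run straight across sentence ends. A quick check, reusing the functions above:

```python
crossing = ('worst', 'of', 'times', 'It')
print(crossing in get_tuples_nosentences(paragraph))       # True: spans two sentences
print(crossing in get_tuples_manual_sentences(paragraph))  # False: the split removes it
```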
```python
_ = get_tuples_manual_sentences(paragraph); print("Number of N-grams:", len(_)); _
```

```
Number of N-grams: 25
[('It', 'was', 'the', 'best'),
 ('was', 'the', 'best', 'of'),
 ('the', 'best', 'of', 'times'),
 ('best', 'of', 'times', 'it'),
 ('of', 'times', 'it', 'was'),
 ('times', 'it', 'was', 'the'),
 ('it', 'was', 'the', 'worst'),
 ('was', 'the', 'worst', 'of'),
 ('the', 'worst', 'of', 'times'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'wisdom'),
 ('age', 'of', 'wisdom', 'It'),
 ('of', 'wisdom', 'It', 'was'),
 ('wisdom', 'It', 'was', 'the'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'foolishness'),
 ('I', 'first', 'met', 'Dr'),
 ('Frankenstein', 'in', 'Munich', 'his'),
 ('in', 'Munich', 'his', 'monster'),
 ('Munich', 'his', 'monster', 'was'),
 ('his', 'monster', 'was', 'presumably'),
 ('monster', 'was', 'presumably', 'at'),
 ('was', 'presumably', 'at', 'home')]
```
```python
_ = get_tuples_nltk_punkt_sentences(paragraph); print("Number of N-grams:", len(_)); _
```

```
Number of N-grams: 25
[('It', 'was', 'the', 'best'),
 ('was', 'the', 'best', 'of'),
 ('the', 'best', 'of', 'times'),
 ('best', 'of', 'times', 'it'),
 ('of', 'times', 'it', 'was'),
 ('times', 'it', 'was', 'the'),
 ('it', 'was', 'the', 'worst'),
 ('was', 'the', 'worst', 'of'),
 ('the', 'worst', 'of', 'times'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'wisdom'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'foolishness'),
 ('I', 'first', 'met', 'Dr'),
 ('first', 'met', 'Dr', 'Frankenstein'),
 ('met', 'Dr', 'Frankenstein', 'in'),
 ('Dr', 'Frankenstein', 'in', 'Munich'),
 ('Frankenstein', 'in', 'Munich', 'his'),
 ('in', 'Munich', 'his', 'monster'),
 ('Munich', 'his', 'monster', 'was'),
 ('his', 'monster', 'was', 'presumably'),
 ('monster', 'was', 'presumably', 'at'),
 ('was', 'presumably', 'at', 'home')]
```
```python
_ = get_tuples_textblob_sentences(paragraph); print("Number of N-grams:", len(_)); _
```

```
Number of N-grams: 25
[('It', 'was', 'the', 'best'),
 ('was', 'the', 'best', 'of'),
 ('the', 'best', 'of', 'times'),
 ('best', 'of', 'times', 'it'),
 ('of', 'times', 'it', 'was'),
 ('times', 'it', 'was', 'the'),
 ('it', 'was', 'the', 'worst'),
 ('was', 'the', 'worst', 'of'),
 ('the', 'worst', 'of', 'times'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'wisdom'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'foolishness'),
 ('I', 'first', 'met', 'Dr'),
 ('first', 'met', 'Dr', 'Frankenstein'),
 ('met', 'Dr', 'Frankenstein', 'in'),
 ('Dr', 'Frankenstein', 'in', 'Munich'),
 ('Frankenstein', 'in', 'Munich', 'his'),
 ('in', 'Munich', 'his', 'monster'),
 ('Munich', 'his', 'monster', 'was'),
 ('his', 'monster', 'was', 'presumably'),
 ('monster', 'was', 'presumably', 'at'),
 ('was', 'presumably', 'at', 'home')]
```
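Punkt and TextBlob both recover the same 25 n-grams here, and unlike the naive splitter they both know `Dr.` does not end a sentence, so `Dr. Frankenstein` stays inside one 4-gram window. A quick check, again reusing the functions above:

```python
bridge = ('met', 'Dr', 'Frankenstein', 'in')
print(bridge in get_tuples_manual_sentences(paragraph))      # False: naive split breaks at "Dr."
print(bridge in get_tuples_nltk_punkt_sentences(paragraph))  # True
print(bridge in get_tuples_textblob_sentences(paragraph))    # True
```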
# Test Similarity
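In standard notation, the two measures computed below are

$$J(A, B) = \frac{|A \cap B|}{|A \cup B|}, \qquad \cos(\mathbf{a}, \mathbf{b}) = \frac{\sum_i a_i b_i}{\sqrt{\sum_i a_i^2}\,\sqrt{\sum_i b_i^2}},$$

where $A$ and $B$ are the sets of distinct n-grams in each text and $a_i$, $b_i$ count how often n-gram $i$ occurs. Jaccard ignores how often an n-gram repeats; cosine weights by counts, which the last example below makes visible.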
```python
a = get_tuples_nosentences("It was the best of times.")
b = get_tuples_nosentences("It was the worst of times.")
print("Jaccard: {} Cosine: {}".format(jaccard_similarity(a, b), cosine_similarity_ngrams(a, b)))
```

```
Jaccard: 0.0 Cosine: 0.0
```
```python
a = get_tuples_nosentences("Above is a bad example of four-gram similarity.")
b = get_tuples_nosentences("This is a better example of four-gram similarity.")
print("Jaccard: {} Cosine: {}".format(jaccard_similarity(a, b), cosine_similarity_ngrams(a, b)))
```

```
Jaccard: 0.2 Cosine: 0.33333333333333337
```
```python
a = get_tuples_nosentences("Jaccard Index ignores repetition repetition repetition repetition repetition.")
b = get_tuples_nosentences("Cosine similarity weighs repetition repetition repetition repetition repetition.")
print("Jaccard: {} Cosine: {}".format(jaccard_similarity(a, b), cosine_similarity_ngrams(a, b)))
```

```
Jaccard: 0.14285714285714285 Cosine: 0.5714285714285714
```
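A small wrap-up sketch: the helper `compare` below is hypothetical (not part of the original gist) and simply runs both measures under each tokenization strategy, making it easy to see how the sentence-splitting choice changes the scores.

```python
TUPLE_FUNCS = [
    get_tuples_nosentences,
    get_tuples_manual_sentences,
    get_tuples_nltk_punkt_sentences,
    get_tuples_textblob_sentences,
]

def compare(txt1, txt2):
    """Print Jaccard and cosine similarity under each tokenization strategy."""
    for fn in TUPLE_FUNCS:
        a, b = fn(txt1), fn(txt2)
        print("{:35} Jaccard: {:.3f}  Cosine: {:.3f}".format(
            fn.__name__, jaccard_similarity(a, b), cosine_similarity_ngrams(a, b)))

# Texts that differ only in end-of-sentence punctuation: the no-sentence and
# Punkt/TextBlob versions see identical n-grams, while the naive splitter
# (which only splits on '.') produces extra boundary-crossing n-grams for
# the second text and therefore a lower score.
compare("It was the age of wisdom. It was the age of foolishness.",
        "It was the age of wisdom! It was the age of foolishness!")
```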
*Notebook kernel: Python 3 (originally run on Python 3.5.2).*