# N-Gram Similarity Comparison

Four ways to extract word n-gram tuples from text (ignoring sentence boundaries, naive splitting on periods/newlines, NLTK's Punkt tokenizer, and TextBlob), plus Jaccard and cosine similarity computed over the resulting n-grams.
```python
# N-gram extraction and similarity measures
import re
import math
from itertools import chain
from collections import Counter

import nltk
from nltk.util import ngrams  # This is the ngram magic.
from textblob import TextBlob

NGRAM = 4

re_sent_ends_naive = re.compile(r'[.\n]')
re_stripper_alpha = re.compile(r'[^a-zA-Z]+')
re_stripper_naive = re.compile(r'[^a-zA-Z.\n]')

splitter_naive = lambda x: re_sent_ends_naive.split(re_stripper_naive.sub(' ', x))

# Pre-trained Punkt sentence tokenizer (requires the 'punkt' NLTK data; see below).
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

def get_tuples_nosentences(txt):
    """Get n-gram tuples, ignoring all punctuation (including sentence boundaries)."""
    if not txt: return None
    ng = ngrams(re_stripper_alpha.sub(' ', txt).split(), NGRAM)
    return list(ng)

def get_tuples_manual_sentences(txt):
    """Naive version: treat periods and newlines as sentence boundaries."""
    if not txt: return None
    sentences = (x.split() for x in splitter_naive(txt) if x)
    ng = (ngrams(x, NGRAM) for x in sentences if len(x) >= NGRAM)
    return list(chain(*ng))

def get_tuples_nltk_punkt_sentences(txt):
    """Split sentences with NLTK's Punkt tokenizer."""
    if not txt: return None
    sentences = (re_stripper_alpha.split(x) for x in sent_detector.tokenize(txt) if x)
    # Filter x because splitting on punctuation leaves empty 'words'.
    ng = (ngrams(filter(None, x), NGRAM) for x in sentences if len(x) >= NGRAM)
    return list(chain(*ng))

def get_tuples_textblob_sentences(txt):
    """Split sentences (and words) with TextBlob."""
    if not txt: return None
    tb = TextBlob(txt)
    ng = (ngrams(x.words, NGRAM) for x in tb.sentences if len(x.words) >= NGRAM)
    return [item for sublist in ng for item in sublist]

def jaccard_similarity(a, b):
    """Jaccard similarity |A & B| / |A | B| between the n-gram sets A and B."""
    a = set(a)
    b = set(b)
    return len(a & b) / len(a | b)

def cosine_similarity_ngrams(a, b):
    """Cosine similarity between the n-gram count vectors of A and B."""
    vec1 = Counter(a)
    vec2 = Counter(b)

    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)

    sum1 = sum(v ** 2 for v in vec1.values())
    sum2 = sum(v ** 2 for v in vec2.values())
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    return numerator / denominator
```
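`nltk.data.load('tokenizers/punkt/english.pickle')` assumes the Punkt model is already installed; if it isn't, a minimal one-time setup looks like this (TextBlob's corpora can likewise be fetched with `python -m textblob.download_corpora`):

```python
import nltk
nltk.download('punkt')  # fetch the Punkt sentence-tokenizer data used above
```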
# Test N-Gram functions
```python
paragraph = """It was the best of times, it was the worst of times.
It was the age of wisdom? It was the age of foolishness!
I first met Dr. Frankenstein in Munich; his monster was, presumably, at home."""
```
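The test paragraph deliberately mixes `?`, `!`, `;`, and the abbreviation `Dr.`, which is exactly what separates the naive splitter from the real sentence tokenizers below.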
```python
_ = get_tuples_nosentences(paragraph); print("Number of N-grams:", len(_)); _
```

```
Number of N-grams: 34
[('It', 'was', 'the', 'best'),
 ('was', 'the', 'best', 'of'),
 ('the', 'best', 'of', 'times'),
 ('best', 'of', 'times', 'it'),
 ('of', 'times', 'it', 'was'),
 ('times', 'it', 'was', 'the'),
 ('it', 'was', 'the', 'worst'),
 ('was', 'the', 'worst', 'of'),
 ('the', 'worst', 'of', 'times'),
 ('worst', 'of', 'times', 'It'),
 ('of', 'times', 'It', 'was'),
 ('times', 'It', 'was', 'the'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'wisdom'),
 ('age', 'of', 'wisdom', 'It'),
 ('of', 'wisdom', 'It', 'was'),
 ('wisdom', 'It', 'was', 'the'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'foolishness'),
 ('age', 'of', 'foolishness', 'I'),
 ('of', 'foolishness', 'I', 'first'),
 ('foolishness', 'I', 'first', 'met'),
 ('I', 'first', 'met', 'Dr'),
 ('first', 'met', 'Dr', 'Frankenstein'),
 ('met', 'Dr', 'Frankenstein', 'in'),
 ('Dr', 'Frankenstein', 'in', 'Munich'),
 ('Frankenstein', 'in', 'Munich', 'his'),
 ('in', 'Munich', 'his', 'monster'),
 ('Munich', 'his', 'monster', 'was'),
 ('his', 'monster', 'was', 'presumably'),
 ('monster', 'was', 'presumably', 'at'),
 ('was', 'presumably', 'at', 'home')]
```
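Note the boundary-crossing n-grams such as `('worst', 'of', 'times', 'It')`: with no sentence splitting, 4-grams run straight across sentence ends. A quick check, reusing the functions above:

```python
crossing = ('worst', 'of', 'times', 'It')
print(crossing in get_tuples_nosentences(paragraph))       # True: spans two sentences
print(crossing in get_tuples_manual_sentences(paragraph))  # False: the split removes it
```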
```python
_ = get_tuples_manual_sentences(paragraph); print("Number of N-grams:", len(_)); _
```

```
Number of N-grams: 25
[('It', 'was', 'the', 'best'),
 ('was', 'the', 'best', 'of'),
 ('the', 'best', 'of', 'times'),
 ('best', 'of', 'times', 'it'),
 ('of', 'times', 'it', 'was'),
 ('times', 'it', 'was', 'the'),
 ('it', 'was', 'the', 'worst'),
 ('was', 'the', 'worst', 'of'),
 ('the', 'worst', 'of', 'times'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'wisdom'),
 ('age', 'of', 'wisdom', 'It'),
 ('of', 'wisdom', 'It', 'was'),
 ('wisdom', 'It', 'was', 'the'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'foolishness'),
 ('I', 'first', 'met', 'Dr'),
 ('Frankenstein', 'in', 'Munich', 'his'),
 ('in', 'Munich', 'his', 'monster'),
 ('Munich', 'his', 'monster', 'was'),
 ('his', 'monster', 'was', 'presumably'),
 ('monster', 'was', 'presumably', 'at'),
 ('was', 'presumably', 'at', 'home')]
```
```python
_ = get_tuples_nltk_punkt_sentences(paragraph); print("Number of N-grams:", len(_)); _
```

```
Number of N-grams: 25
[('It', 'was', 'the', 'best'),
 ('was', 'the', 'best', 'of'),
 ('the', 'best', 'of', 'times'),
 ('best', 'of', 'times', 'it'),
 ('of', 'times', 'it', 'was'),
 ('times', 'it', 'was', 'the'),
 ('it', 'was', 'the', 'worst'),
 ('was', 'the', 'worst', 'of'),
 ('the', 'worst', 'of', 'times'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'wisdom'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'foolishness'),
 ('I', 'first', 'met', 'Dr'),
 ('first', 'met', 'Dr', 'Frankenstein'),
 ('met', 'Dr', 'Frankenstein', 'in'),
 ('Dr', 'Frankenstein', 'in', 'Munich'),
 ('Frankenstein', 'in', 'Munich', 'his'),
 ('in', 'Munich', 'his', 'monster'),
 ('Munich', 'his', 'monster', 'was'),
 ('his', 'monster', 'was', 'presumably'),
 ('monster', 'was', 'presumably', 'at'),
 ('was', 'presumably', 'at', 'home')]
```
```python
_ = get_tuples_textblob_sentences(paragraph); print("Number of N-grams:", len(_)); _
```

```
Number of N-grams: 25
[('It', 'was', 'the', 'best'),
 ('was', 'the', 'best', 'of'),
 ('the', 'best', 'of', 'times'),
 ('best', 'of', 'times', 'it'),
 ('of', 'times', 'it', 'was'),
 ('times', 'it', 'was', 'the'),
 ('it', 'was', 'the', 'worst'),
 ('was', 'the', 'worst', 'of'),
 ('the', 'worst', 'of', 'times'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'wisdom'),
 ('It', 'was', 'the', 'age'),
 ('was', 'the', 'age', 'of'),
 ('the', 'age', 'of', 'foolishness'),
 ('I', 'first', 'met', 'Dr'),
 ('first', 'met', 'Dr', 'Frankenstein'),
 ('met', 'Dr', 'Frankenstein', 'in'),
 ('Dr', 'Frankenstein', 'in', 'Munich'),
 ('Frankenstein', 'in', 'Munich', 'his'),
 ('in', 'Munich', 'his', 'monster'),
 ('Munich', 'his', 'monster', 'was'),
 ('his', 'monster', 'was', 'presumably'),
 ('monster', 'was', 'presumably', 'at'),
 ('was', 'presumably', 'at', 'home')]
```
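Punkt and TextBlob both recover the same 25 n-grams here, and unlike the naive splitter they both know `Dr.` does not end a sentence, so `Dr. Frankenstein` stays inside one 4-gram window. A quick check, again reusing the functions above:

```python
bridge = ('met', 'Dr', 'Frankenstein', 'in')
print(bridge in get_tuples_manual_sentences(paragraph))      # False: naive split breaks at "Dr."
print(bridge in get_tuples_nltk_punkt_sentences(paragraph))  # True
print(bridge in get_tuples_textblob_sentences(paragraph))    # True
```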
# Test Similarity
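In standard notation, the two measures computed below are

$$J(A, B) = \frac{|A \cap B|}{|A \cup B|}, \qquad \cos(\mathbf{a}, \mathbf{b}) = \frac{\sum_i a_i b_i}{\sqrt{\sum_i a_i^2}\,\sqrt{\sum_i b_i^2}},$$

where $A$ and $B$ are the sets of distinct n-grams in each text and $a_i$, $b_i$ count how often n-gram $i$ occurs. Jaccard ignores how often an n-gram repeats; cosine weights by counts, which the last example below makes visible.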
```python
a = get_tuples_nosentences("It was the best of times.")
b = get_tuples_nosentences("It was the worst of times.")
print("Jaccard: {} Cosine: {}".format(jaccard_similarity(a, b), cosine_similarity_ngrams(a, b)))
```

```
Jaccard: 0.0 Cosine: 0.0
```
```python
a = get_tuples_nosentences("Above is a bad example of four-gram similarity.")
b = get_tuples_nosentences("This is a better example of four-gram similarity.")
print("Jaccard: {} Cosine: {}".format(jaccard_similarity(a, b), cosine_similarity_ngrams(a, b)))
```

```
Jaccard: 0.2 Cosine: 0.33333333333333337
```
```python
a = get_tuples_nosentences("Jaccard Index ignores repetition repetition repetition repetition repetition.")
b = get_tuples_nosentences("Cosine similarity weighs repetition repetition repetition repetition repetition.")
print("Jaccard: {} Cosine: {}".format(jaccard_similarity(a, b), cosine_similarity_ngrams(a, b)))
```

```
Jaccard: 0.14285714285714285 Cosine: 0.5714285714285714
```
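A small wrap-up sketch: the helper `compare` below is hypothetical (not part of the original gist) and simply runs both measures under each tokenization strategy, making it easy to see how the sentence-splitting choice changes the scores.

```python
TUPLE_FUNCS = [
    get_tuples_nosentences,
    get_tuples_manual_sentences,
    get_tuples_nltk_punkt_sentences,
    get_tuples_textblob_sentences,
]

def compare(txt1, txt2):
    """Print Jaccard and cosine similarity under each tokenization strategy."""
    for fn in TUPLE_FUNCS:
        a, b = fn(txt1), fn(txt2)
        print("{:35} Jaccard: {:.3f}  Cosine: {:.3f}".format(
            fn.__name__, jaccard_similarity(a, b), cosine_similarity_ngrams(a, b)))

# Texts that differ only in end-of-sentence punctuation: the no-sentence and
# Punkt/TextBlob versions see identical n-grams, while the naive splitter
# (which only splits on '.') produces extra boundary-crossing n-grams for
# the second text and therefore a lower score.
compare("It was the age of wisdom. It was the age of foolishness.",
        "It was the age of wisdom! It was the age of foolishness!")
```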
*Notebook kernel: Python 3 (originally run on Python 3.5.2).*