Created
December 27, 2017 17:59
-
-
Save dlemphers/cc16e994f985d9144298154a1824a139 to your computer and use it in GitHub Desktop.
Char2vec - Character embeddings for word similarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Char2Vec\n", | |
"\n", | |
"Dave Lemphers\n", | |
"\n", | |
"https://davelemphers.com" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The intuition behind this is based on the work done on [Word2Vec](https://en.wikipedia.org/wiki/Word2vec) and is an experiment to see if using character embedding is useful for situations where character counts don't achieve the goal." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"from sklearn.metrics.pairwise import cosine_similarity\n", | |
"\n", | |
"from IPython.display import display, Markdown" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
    "## `embedded_word` slides windows of every size in [start, end) over a word to collect its character n-gram context"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def embedded_word(word, sliding_window_start, sliding_window_end):\n", | |
" context = {}\n", | |
" \n", | |
" for sliding_window_size in range(sliding_window_start, sliding_window_end):\n", | |
" for window in range(0, len(word)):\n", | |
" current_window = word[window:window + sliding_window_size]\n", | |
" if(len(current_window) == sliding_window_size):\n", | |
" context[current_window] = 0\n", | |
" \n", | |
" return context\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
    "## We can vary the sliding-window hyperparameters to see their effect on certain words"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'a': 0, 'c': 0, 't': 0}" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"{'at': 0, 'ca': 0}" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"{'a': 0, 'at': 0, 'c': 0, 'ca': 0, 't': 0}" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"for configs in [(1,2), (2,3), (1,3)]:\n", | |
" display(embedded_word(\"cat\", *configs))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## The idea here is to simply create an embedding matrix for the words. You can provide a richer vocabulary of words or simply provide the two words and it will construct a vocab from these" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def embedding_matrix(word1, word2, vocab=None, sliding_window_params=(1,3)):\n", | |
" \n", | |
" if not vocab:\n", | |
" vocab = [word1, word2]\n", | |
" \n", | |
" word1_emb = embedded_word(word1, *sliding_window_params)\n", | |
" word2_emb = embedded_word(word2, *sliding_window_params)\n", | |
" \n", | |
" vocab_emb = {}\n", | |
" for vocab_word in vocab:\n", | |
" vocab_emb.update(embedded_word(vocab_word, *sliding_window_params))\n", | |
" \n", | |
" vocab = list(set(list(vocab_emb.keys()))) + list(set(list(word1_emb.keys()) + list(word2_emb.keys())))\n", | |
"# vocab = list(set(list(vocab_emb.keys())))\n", | |
" \n", | |
" embedding = np.zeros((2, len(vocab)))\n", | |
" \n", | |
" for idx, word_emb in enumerate([word1_emb, word2_emb]):\n", | |
" for tok in word_emb.keys():\n", | |
" embedding[idx][vocab.index(tok)] += 1\n", | |
" \n", | |
" return embedding" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,\n", | |
" 0.],\n", | |
" [ 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,\n", | |
" 0.]])" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
    "# Example: rows are \"cat\" and \"hat\"; columns are their character n-grams.\n",
    "embedding_matrix(\"cat\", \"hat\")"
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Cosine similarity on the embeddings" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def vec_similarity(word1, word2, vocab=None, config=(1,3)):\n", | |
" \n", | |
" matrix = embedding_matrix(\n", | |
" word1, \n", | |
" word2, \n", | |
" vocab=vocab, \n", | |
" sliding_window_params=config\n", | |
" )\n", | |
" \n", | |
" return cosine_similarity([matrix[0]], [matrix[1]])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Tf-idf cosine similarity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def tf_similarity(word1, word2, vocab=None, config=(1,3)):\n", | |
" \n", | |
" if not vocab:\n", | |
" vocab = [word1, word2]\n", | |
"\n", | |
" tfidf_vectorizer = TfidfVectorizer(\n", | |
" analyzer=\"char\", \n", | |
" lowercase=True, \n", | |
" ngram_range=config\n", | |
" )\n", | |
"\n", | |
" tfidf_vectorizer.fit_transform(vocab)\n", | |
"\n", | |
" return cosine_similarity(\n", | |
" tfidf_vectorizer.transform([word1]),\n", | |
" tfidf_vectorizer.transform([word2]),\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Try out some canonical pairs and play with sliding window" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 66, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"results = []\n", | |
"\n", | |
"for test_pair in [\n", | |
" (\"bag\", \"bags\"),\n", | |
" (\"bag\", \"bad\"),\n", | |
" (\"bag\", \"gab\")\n", | |
"]:\n", | |
" for test_method in [vec_similarity, tf_similarity]:\n", | |
" for test_window in [(1,2), (2,3), (1,3)]:\n", | |
" result = test_method(*test_pair, config=test_window)\n", | |
" results.append([test_method.__name__, test_pair, test_window, result[0][0]])\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 67, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Method</th>\n", | |
" <th>Test Pair</th>\n", | |
" <th>Options</th>\n", | |
" <th>Result</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, bags)</td>\n", | |
" <td>(1, 2)</td>\n", | |
" <td>0.866025</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, bags)</td>\n", | |
" <td>(2, 3)</td>\n", | |
" <td>0.816497</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, bags)</td>\n", | |
" <td>(1, 3)</td>\n", | |
" <td>0.845154</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, bags)</td>\n", | |
" <td>(1, 2)</td>\n", | |
" <td>0.747407</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, bags)</td>\n", | |
" <td>(2, 3)</td>\n", | |
" <td>0.656973</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, bags)</td>\n", | |
" <td>(1, 3)</td>\n", | |
" <td>0.709297</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, bad)</td>\n", | |
" <td>(1, 2)</td>\n", | |
" <td>0.666667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, bad)</td>\n", | |
" <td>(2, 3)</td>\n", | |
" <td>0.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, bad)</td>\n", | |
" <td>(1, 3)</td>\n", | |
" <td>0.600000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, bad)</td>\n", | |
" <td>(1, 2)</td>\n", | |
" <td>0.431613</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, bad)</td>\n", | |
" <td>(2, 3)</td>\n", | |
" <td>0.201993</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, bad)</td>\n", | |
" <td>(1, 3)</td>\n", | |
" <td>0.336097</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, gab)</td>\n", | |
" <td>(1, 2)</td>\n", | |
" <td>1.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, gab)</td>\n", | |
" <td>(2, 3)</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, gab)</td>\n", | |
" <td>(1, 3)</td>\n", | |
" <td>0.600000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, gab)</td>\n", | |
" <td>(1, 2)</td>\n", | |
" <td>0.431613</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, gab)</td>\n", | |
" <td>(2, 3)</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, gab)</td>\n", | |
" <td>(1, 3)</td>\n", | |
" <td>0.336097</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Method Test Pair Options Result\n", | |
"0 vec_similarity (bag, bags) (1, 2) 0.866025\n", | |
"1 vec_similarity (bag, bags) (2, 3) 0.816497\n", | |
"2 vec_similarity (bag, bags) (1, 3) 0.845154\n", | |
"3 tf_similarity (bag, bags) (1, 2) 0.747407\n", | |
"4 tf_similarity (bag, bags) (2, 3) 0.656973\n", | |
"5 tf_similarity (bag, bags) (1, 3) 0.709297\n", | |
"6 vec_similarity (bag, bad) (1, 2) 0.666667\n", | |
"7 vec_similarity (bag, bad) (2, 3) 0.500000\n", | |
"8 vec_similarity (bag, bad) (1, 3) 0.600000\n", | |
"9 tf_similarity (bag, bad) (1, 2) 0.431613\n", | |
"10 tf_similarity (bag, bad) (2, 3) 0.201993\n", | |
"11 tf_similarity (bag, bad) (1, 3) 0.336097\n", | |
"12 vec_similarity (bag, gab) (1, 2) 1.000000\n", | |
"13 vec_similarity (bag, gab) (2, 3) 0.000000\n", | |
"14 vec_similarity (bag, gab) (1, 3) 0.600000\n", | |
"15 tf_similarity (bag, gab) (1, 2) 0.431613\n", | |
"16 tf_similarity (bag, gab) (2, 3) 0.000000\n", | |
"17 tf_similarity (bag, gab) (1, 3) 0.336097" | |
] | |
}, | |
"execution_count": 67, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.DataFrame(results, columns=[\"Method\", \"Test Pair\", \"Options\", \"Result\"])\n", | |
"df.style.set_properties(**{'font-size':'10pt'})\n", | |
"df" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment