{
"cells": [
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"pd.set_option('display.max_colwidth',1000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get the data\n",
"`node_a` and `node_b` are query strings, and `edge_weight` is the number of times they occurred in the same session.\n",
"\n",
"#### _Note: method here adapted from work by colleague Ryan Carr (thanks!)_"
]
},
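{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below reads a precomputed edge list. For orientation, here is a minimal sketch of how such a co-occurrence edge list could be derived from a raw search log; the `data/search_log.csv` path and its `session_id`/`query` columns are hypothetical, not part of this analysis.\n",
"\n",
"```python\n",
"import itertools\n",
"\n",
"import pandas as pd\n",
"\n",
"log = pd.read_csv('data/search_log.csv')  # hypothetical raw log: one row per (session_id, query)\n",
"\n",
"rows = []\n",
"for _, queries in log.groupby('session_id')['query']:\n",
"    # ordered pairs, so each edge appears in both directions, as in the file above\n",
"    rows.extend(itertools.permutations(set(queries), 2))\n",
"\n",
"edges = (pd.DataFrame(rows, columns=['node_a', 'node_b'])\n",
"         .groupby(['node_a', 'node_b'])\n",
"         .size()\n",
"         .reset_index(name='edge_weight'))\n",
"```"
]
},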
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>node_a</th>\n",
" <th>node_b</th>\n",
" <th>edge_weight</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>memorial day weekend</td>\n",
" <td>memorial day weekend events</td>\n",
" <td>4976</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>memorial day weekend events</td>\n",
" <td>memorial day weekend</td>\n",
" <td>4976</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>memorial day events</td>\n",
" <td>memorial day weekend events</td>\n",
" <td>3164</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>memorial day weekend events</td>\n",
" <td>memorial day events</td>\n",
" <td>3164</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>memorial day events</td>\n",
" <td>memorial day weekend</td>\n",
" <td>1969</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>memorial day weekend</td>\n",
" <td>memorial day events</td>\n",
" <td>1969</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>job fair</td>\n",
" <td>job fairs</td>\n",
" <td>1331</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>job fairs</td>\n",
" <td>job fair</td>\n",
" <td>1331</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>car show</td>\n",
" <td>car shows</td>\n",
" <td>1287</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>car shows</td>\n",
" <td>car show</td>\n",
" <td>1287</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>car show</td>\n",
" <td>classic car show</td>\n",
" <td>1231</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>classic car show</td>\n",
" <td>car show</td>\n",
" <td>1231</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>job fair</td>\n",
" <td>career fair</td>\n",
" <td>1171</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>career fair</td>\n",
" <td>job fair</td>\n",
" <td>1171</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>cinco de mayo</td>\n",
" <td>cinco de mayo party</td>\n",
" <td>1073</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>cinco de mayo party</td>\n",
" <td>cinco de mayo</td>\n",
" <td>1073</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>party</td>\n",
" <td>day party</td>\n",
" <td>998</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>day party</td>\n",
" <td>party</td>\n",
" <td>998</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>cinco de mayo party</td>\n",
" <td>cinco de mayo events</td>\n",
" <td>948</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>cinco de mayo events</td>\n",
" <td>cinco de mayo party</td>\n",
" <td>948</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" node_a node_b edge_weight\n",
"0 memorial day weekend memorial day weekend events 4976\n",
"1 memorial day weekend events memorial day weekend 4976\n",
"2 memorial day events memorial day weekend events 3164\n",
"3 memorial day weekend events memorial day events 3164\n",
"4 memorial day events memorial day weekend 1969\n",
"5 memorial day weekend memorial day events 1969\n",
"6 job fair job fairs 1331\n",
"7 job fairs job fair 1331\n",
"8 car show car shows 1287\n",
"9 car shows car show 1287\n",
"10 car show classic car show 1231\n",
"11 classic car show car show 1231\n",
"12 job fair career fair 1171\n",
"13 career fair job fair 1171\n",
"14 cinco de mayo cinco de mayo party 1073\n",
"15 cinco de mayo party cinco de mayo 1073\n",
"16 party day party 998\n",
"17 day party party 998\n",
"18 cinco de mayo party cinco de mayo events 948\n",
"19 cinco de mayo events cinco de mayo party 948"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"synonym_graph = pd.read_csv('data/synonyms_by_session.csv', names=['node_a', 'node_b', 'edge_weight'])\n",
"synonym_graph.head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Normalize the edge weight\n",
"Goal: create a notion of `edge_weight` that discounts trivial \"popular query\" relationships.\n",
"\n",
"`norm_edge_weight = edge_weight / node_b_count`\n",
"\n",
"Here `node_b_count` is the number of distinct queries that co-occur with `node_a`; because every edge appears in both directions, this is also the number of edges in which that query appears as `node_b`."
]
},
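{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick illustration with made-up numbers (not from the real data): a query that co-occurs with many different queries has each of its edge weights discounted, while a query with a single partner keeps its full weight.\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"# toy edge list, numbers invented for the example\n",
"toy = pd.DataFrame({\n",
"    'node_a': ['party', 'party', 'party', 'oakroom'],\n",
"    'node_b': ['day party', 'pool party', 'club', 'oak room'],\n",
"    'edge_weight': [998, 500, 400, 100],\n",
"})\n",
"\n",
"degree = toy.groupby('node_a')['node_b'].count()  # party: 3 partners, oakroom: 1\n",
"toy['norm_edge_weight'] = toy['edge_weight'] / toy['node_a'].map(degree)\n",
"# 'oakroom' -> 'oak room' keeps 100.0, while the popular 'party' edges\n",
"# are discounted to roughly 332.7, 166.7 and 133.3\n",
"```"
]
},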
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>node_a</th>\n",
" <th>node_b</th>\n",
" <th>edge_weight</th>\n",
" <th>node_b_count</th>\n",
" <th>norm_edge_weight</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8431</th>\n",
" <td>memorial day events</td>\n",
" <td>memorial day weekend events</td>\n",
" <td>3164</td>\n",
" <td>10</td>\n",
" <td>316.400000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4596</th>\n",
" <td>earth day festival</td>\n",
" <td>earth day events</td>\n",
" <td>412</td>\n",
" <td>2</td>\n",
" <td>206.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8821</th>\n",
" <td>mother</td>\n",
" <td>mothers day</td>\n",
" <td>617</td>\n",
" <td>3</td>\n",
" <td>205.666667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8432</th>\n",
" <td>memorial day events</td>\n",
" <td>memorial day weekend</td>\n",
" <td>1969</td>\n",
" <td>10</td>\n",
" <td>196.900000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9045</th>\n",
" <td>mothers day brunch</td>\n",
" <td>mothers day</td>\n",
" <td>534</td>\n",
" <td>3</td>\n",
" <td>178.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5229</th>\n",
" <td>father</td>\n",
" <td>fathers day</td>\n",
" <td>174</td>\n",
" <td>1</td>\n",
" <td>174.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14369</th>\n",
" <td>yog</td>\n",
" <td>yoga</td>\n",
" <td>159</td>\n",
" <td>1</td>\n",
" <td>159.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2474</th>\n",
" <td>carnival dates</td>\n",
" <td>carnival</td>\n",
" <td>158</td>\n",
" <td>1</td>\n",
" <td>158.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3793</th>\n",
" <td>curl fest</td>\n",
" <td>curlfest</td>\n",
" <td>149</td>\n",
" <td>1</td>\n",
" <td>149.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2977</th>\n",
" <td>cinco de mayo festival</td>\n",
" <td>cinco de mayo events</td>\n",
" <td>742</td>\n",
" <td>5</td>\n",
" <td>148.400000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" node_a node_b edge_weight \\\n",
"8431 memorial day events memorial day weekend events 3164 \n",
"4596 earth day festival earth day events 412 \n",
"8821 mother mothers day 617 \n",
"8432 memorial day events memorial day weekend 1969 \n",
"9045 mothers day brunch mothers day 534 \n",
"5229 father fathers day 174 \n",
"14369 yog yoga 159 \n",
"2474 carnival dates carnival 158 \n",
"3793 curl fest curlfest 149 \n",
"2977 cinco de mayo festival cinco de mayo events 742 \n",
"\n",
" node_b_count norm_edge_weight \n",
"8431 10 316.400000 \n",
"4596 2 206.000000 \n",
"8821 3 205.666667 \n",
"8432 10 196.900000 \n",
"9045 3 178.000000 \n",
"5229 1 174.000000 \n",
"14369 1 159.000000 \n",
"2474 1 158.000000 \n",
"3793 1 149.000000 \n",
"2977 5 148.400000 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# for each query, count how many distinct partner queries it co-occurs with;\n",
"# the edge list is symmetric, so this also equals how often it appears as node_b\n",
"num_times_node_b_is_with_another_query = synonym_graph.groupby('node_a').agg({'node_b': 'count'})\n",
"synonym_graph_norm = synonym_graph.set_index('node_a')\\\n",
"    .join(num_times_node_b_is_with_another_query, rsuffix='_count')\\\n",
"    .reset_index()\n",
"synonym_graph_norm['norm_edge_weight'] = synonym_graph_norm.edge_weight / synonym_graph_norm.node_b_count\n",
"synonym_graph_norm.sort_values('norm_edge_weight', ascending=False).head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create adjacency matrix of query text\n",
"Rows and columns correspond to every query string; values are the normalized edge weights."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2649, 2649)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pivot the edge list into a dense query-by-query matrix of normalized weights\n",
"query_mat = pd.crosstab(\n",
"    synonym_graph_norm['node_a'],\n",
"    synonym_graph_norm['node_b'],\n",
"    values=synonym_graph_norm['norm_edge_weight'],\n",
"    aggfunc='sum',\n",
").fillna(0)\n",
"query_mat.shape"
]
},
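{
"cell_type": "markdown",
"metadata": {},
"source": [
"Affinity Propagation with `affinity='precomputed'` (used below) expects a square matrix whose rows and columns refer to the same items in the same order. Because every pair appears in both directions, the crosstab's row and column labels should already coincide; a quick sanity check along these lines (a sketch, not part of the original run):\n",
"\n",
"```python\n",
"# rows and columns should be the same sorted set of query strings\n",
"assert query_mat.shape[0] == query_mat.shape[1]\n",
"assert list(query_mat.index) == list(query_mat.columns)\n",
"```"
]
},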
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's look at a subset of the matrix."
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>node_b</th>\n",
" <th>memorial day weekend events</th>\n",
" <th>memorial day weekend</th>\n",
" <th>memorial day events</th>\n",
" <th>mother</th>\n",
" <th>mothers day</th>\n",
" <th>mothers day brunch</th>\n",
" </tr>\n",
" <tr>\n",
" <th>node_a</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>memorial day weekend events</th>\n",
" <td>0.000000</td>\n",
" <td>105.872340</td>\n",
" <td>67.319149</td>\n",
" <td>0.000000</td>\n",
" <td>1.234043</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>memorial day weekend</th>\n",
" <td>73.176471</td>\n",
" <td>0.000000</td>\n",
" <td>28.955882</td>\n",
" <td>0.000000</td>\n",
" <td>0.794118</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>memorial day events</th>\n",
" <td>316.400000</td>\n",
" <td>196.900000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>2.100000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mother</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>205.666667</td>\n",
" <td>20.666667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mothers day</th>\n",
" <td>0.271028</td>\n",
" <td>0.252336</td>\n",
" <td>0.098131</td>\n",
" <td>2.883178</td>\n",
" <td>0.000000</td>\n",
" <td>2.495327</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mothers day brunch</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>20.666667</td>\n",
" <td>178.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"node_b memorial day weekend events \\\n",
"node_a \n",
"memorial day weekend events 0.000000 \n",
"memorial day weekend 73.176471 \n",
"memorial day events 316.400000 \n",
"mother 0.000000 \n",
"mothers day 0.271028 \n",
"mothers day brunch 0.000000 \n",
"\n",
"node_b memorial day weekend memorial day events \\\n",
"node_a \n",
"memorial day weekend events 105.872340 67.319149 \n",
"memorial day weekend 0.000000 28.955882 \n",
"memorial day events 196.900000 0.000000 \n",
"mother 0.000000 0.000000 \n",
"mothers day 0.252336 0.098131 \n",
"mothers day brunch 0.000000 0.000000 \n",
"\n",
"node_b mother mothers day mothers day brunch \n",
"node_a \n",
"memorial day weekend events 0.000000 1.234043 0.000000 \n",
"memorial day weekend 0.000000 0.794118 0.000000 \n",
"memorial day events 0.000000 2.100000 0.000000 \n",
"mother 0.000000 205.666667 20.666667 \n",
"mothers day 2.883178 0.000000 2.495327 \n",
"mothers day brunch 20.666667 178.000000 0.000000 "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"queries = ['memorial day weekend events', 'memorial day weekend', 'memorial day events', 'mother', 'mothers day', 'mothers day brunch']\n",
"query_mat.loc[queries, queries]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Identify query clusters using \"Affinity Propagation\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"labels: [138 3 4 ... 679 680 41]\n"
]
}
],
"source": [
"from sklearn.cluster import AffinityPropagation\n",
"\n",
"try:\n",
"    import joblib\n",
"except ImportError:\n",
"    # older scikit-learn releases bundled joblib under sklearn.externals\n",
"    from sklearn.externals import joblib\n",
"\n",
"query_affinity_file = 'data/query_string_clusters/query_affinity.mdl'\n",
"\n",
"# load a previously fit model if one is cached; otherwise fit on the\n",
"# precomputed similarity matrix and cache the result\n",
"try:\n",
"    aff = joblib.load(query_affinity_file)\n",
"    labels = aff.labels_\n",
"except Exception:\n",
"    aff = AffinityPropagation(\n",
"        damping=.8,\n",
"        max_iter=200,\n",
"        convergence_iter=20,\n",
"        affinity='precomputed',\n",
"    )\n",
"    labels = aff.fit_predict(query_mat)\n",
"    joblib.dump(aff, query_affinity_file)\n",
"\n",
"print('labels:', labels)"
]
},
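{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick way to see what the fit produced (a small check sketch; `cluster_centers_indices_` and `labels_` are standard `AffinityPropagation` attributes):\n",
"\n",
"```python\n",
"# how many clusters were found, and a few of their exemplar queries\n",
"print(len(aff.cluster_centers_indices_), 'clusters')\n",
"print(query_mat.index[aff.cluster_centers_indices_[:5]].tolist())\n",
"```"
]
},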
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Every query string gets a cluster number, which is stored in `labels`.\n",
"\n",
"Let's collect all the queries together according to their label."
]
},
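{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because `labels[i]` corresponds to `query_mat.index[i]`, a raw query string can be mapped straight to its cluster number (a small sketch; `'job fair'` is just one of the queries seen earlier):\n",
"\n",
"```python\n",
"# map each query string to the cluster number assigned by Affinity Propagation\n",
"query_to_cluster = dict(zip(query_mat.index, labels))\n",
"query_to_cluster['job fair']  # cluster number for the 'job fair' query\n",
"```"
]
},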
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>queries</th>\n",
" </tr>\n",
" <tr>\n",
" <th>cluster_number</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>[april 27, april 28, april 29]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>277</th>\n",
" <td>[fluxx, parq]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>328</th>\n",
" <td>[hip ho, hip hop, hiphop, rap, rb]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>425</th>\n",
" <td>[advertising, brand, branding, communication, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>238</th>\n",
" <td>[conferencia dunamis, conferência dunamis, dun...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>442</th>\n",
" <td>[method man]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>529</th>\n",
" <td>[bb kings, reggae fest, reggaefest]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>422</th>\n",
" <td>[make up classes, makeup class, makeup classes...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>[anime north 2018]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>290</th>\n",
" <td>[game, games]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" queries\n",
"cluster_number \n",
"38 [april 27, april 28, april 29]\n",
"277 [fluxx, parq]\n",
"328 [hip ho, hip hop, hiphop, rap, rb]\n",
"425 [advertising, brand, branding, communication, ...\n",
"238 [conferencia dunamis, conferência dunamis, dun...\n",
"442 [method man]\n",
"529 [bb kings, reggae fest, reggaefest]\n",
"422 [make up classes, makeup class, makeup classes...\n",
"34 [anime north 2018]\n",
"290 [game, games]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query_families = pd.DataFrame(\n",
" list((zip(labels, query_mat.index))),\n",
" columns=['cluster_number', 'queries'],\n",
").groupby(\n",
" 'cluster_number'\n",
").agg({'queries':lambda x: list(x)})\n",
"query_families.sample(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`aff.cluster_centers_indices_` indicates which query is the \"center\" or \"exemplar\" of each cluster.\n",
"\n",
"Pull the exemplar query into a new column.\n",
"\n",
"Notice:\n",
"* The queries in each cluster make sense together.\n",
"* The exemplars are a reasonable \"best\" representative of their cluster."
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>queries</th>\n",
" <th>num_queries</th>\n",
" <th>exemplar</th>\n",
" </tr>\n",
" <tr>\n",
" <th>cluster_number</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>459</th>\n",
" <td>[fashion week casting call, league of legends, lol, msi]</td>\n",
" <td>4</td>\n",
" <td>msi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>619</th>\n",
" <td>[free yoga class, tequila]</td>\n",
" <td>2</td>\n",
" <td>tequila</td>\n",
" </tr>\n",
" <tr>\n",
" <th>479</th>\n",
" <td>[oak room, oakroom]</td>\n",
" <td>2</td>\n",
" <td>oakroom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>177</th>\n",
" <td>[couples, date night, marriage]</td>\n",
" <td>3</td>\n",
" <td>couples</td>\n",
" </tr>\n",
" <tr>\n",
" <th>648</th>\n",
" <td>[free vending event for vendors, vendors needed, vendors wanted]</td>\n",
" <td>3</td>\n",
" <td>vendors needed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>336</th>\n",
" <td>[hospitality, hotel]</td>\n",
" <td>2</td>\n",
" <td>hotel</td>\n",
" </tr>\n",
" <tr>\n",
" <th>455</th>\n",
" <td>[moonrise, moonrise festival]</td>\n",
" <td>2</td>\n",
" <td>moonrise festival</td>\n",
" </tr>\n",
" <tr>\n",
" <th>527</th>\n",
" <td>[red rocks, redrocks, redrocks h street]</td>\n",
" <td>3</td>\n",
" <td>red rocks</td>\n",
" </tr>\n",
" <tr>\n",
" <th>355</th>\n",
" <td>[islam, islamic, islamic event, quran]</td>\n",
" <td>4</td>\n",
" <td>islam</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85</th>\n",
" <td>[ai, bitcoin, block, block chain, blockchai, blockchain, blockchain week, consensus, crypto, cryptocurrencies, cryptocurrency, eos, ethereum, hyperledger, ico]</td>\n",
" <td>15</td>\n",
" <td>blockchain</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" queries \\\n",
"cluster_number \n",
"459 [fashion week casting call, league of legends, lol, msi] \n",
"619 [free yoga class, tequila] \n",
"479 [oak room, oakroom] \n",
"177 [couples, date night, marriage] \n",
"648 [free vending event for vendors, vendors needed, vendors wanted] \n",
"336 [hospitality, hotel] \n",
"455 [moonrise, moonrise festival] \n",
"527 [red rocks, redrocks, redrocks h street] \n",
"355 [islam, islamic, islamic event, quran] \n",
"85 [ai, bitcoin, block, block chain, blockchai, blockchain, blockchain week, consensus, crypto, cryptocurrencies, cryptocurrency, eos, ethereum, hyperledger, ico] \n",
"\n",
" num_queries exemplar \n",
"cluster_number \n",
"459 4 msi \n",
"619 2 tequila \n",
"479 2 oakroom \n",
"177 3 couples \n",
"648 3 vendors needed \n",
"336 2 hotel \n",
"455 2 moonrise festival \n",
"527 3 red rocks \n",
"355 4 islam \n",
"85 15 blockchain "
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# add columns for the number of queries in each cluster and for its exemplar query\n",
"query_families['num_queries'] = query_families['queries'].apply(len)\n",
"query_families['exemplar'] = query_mat.index[aff.cluster_centers_indices_]\n",
"\n",
"query_families.sample(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Reshaping the data to build a better tagging model\n",
"The affinity analysis appears to be working, but we need to reshape the data before we can use it.\n",
"\n",
"Requirement: given a *raw query string*, we need to know two things (a lookup sketch follows the next cell):\n",
"1. What is the exemplar query string for this query?\n",
"2. How \"strong\" is this query in relation to its exemplar?"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"# get portion of query_mat that corresponds to the exemplars\n",
"exemplar_query_mat = query_mat.iloc[aff.cluster_centers_indices_]\n",
"\n",
"# get artifical max score for each query (TODO improve)\n",
"query_score = synonym_graph_norm.groupby('node_a').agg({'norm_edge_weight': 'max'}) * 1.1\n",
"\n",
"# create version of exemplar_query_mat that zeros out all the values that don't correspond to clustered queries\n",
"masked_exemplar_query_mat = exemplar_query_mat.copy()*0\n",
"\n",
"for i, row in query_families.iterrows():\n",
" masked_exemplar_query_mat.loc[row['exemplar']][row['queries']] = 1\n",
" \n",
"masked_exemplar_query_mat = masked_exemplar_query_mat * exemplar_query_mat\n",
"\n",
"for i, row in query_families.iterrows():\n",
" query_text = row['exemplar']\n",
" masked_exemplar_query_mat.loc[query_text][query_text] = query_score.loc[query_text]"
]
},
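{
"cell_type": "markdown",
"metadata": {},
"source": [
"The lookup sketch promised above: for a raw query string, the non-zero entry in its column of `masked_exemplar_query_mat` identifies its exemplar, and the value itself is the strength. The `exemplar_for` helper is hypothetical, added here only to illustrate the intended use.\n",
"\n",
"```python\n",
"# hypothetical helper (not part of the original notebook):\n",
"# return (exemplar query, strength) for a raw query string, or None if it was never clustered\n",
"def exemplar_for(raw_query):\n",
"    if raw_query not in masked_exemplar_query_mat.columns:\n",
"        return None\n",
"    col = masked_exemplar_query_mat[raw_query]\n",
"    col = col[col > 0]\n",
"    if col.empty:\n",
"        return None\n",
"    return col.idxmax(), col.max()\n",
"\n",
"exemplar_for('block chain')  # e.g. points at the 'blockchain' exemplar\n",
"```"
]
},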
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>node_b</th>\n",
" <th>machine learning</th>\n",
" <th>artificial intelligence</th>\n",
" <th>bitcoin</th>\n",
" <th>block</th>\n",
" <th>block chain</th>\n",
" <th>blockchai</th>\n",
" <th>deep learning</th>\n",
" <th>python</th>\n",
" <th>blockchain</th>\n",
" <th>blockchain week</th>\n",
" <th>tensorflow</th>\n",
" <th>consensus</th>\n",
" <th>crypto</th>\n",
" <th>cryptocurrency</th>\n",
" <th>ethereum</th>\n",
" </tr>\n",
" <tr>\n",
" <th>node_a</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>machine learning</th>\n",
" <td>11.733333</td>\n",
" <td>10.666667</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>6.8</td>\n",
" <td>6.4</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.8</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>artificial intelligence</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bitcoin</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>block</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>block chain</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>blockchai</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>deep learning</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>python</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>blockchain</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>6.0</td>\n",
" <td>0.361905</td>\n",
" <td>0.6</td>\n",
" <td>0.466667</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>9.774286</td>\n",
" <td>0.295238</td>\n",
" <td>0.0</td>\n",
" <td>0.438095</td>\n",
" <td>8.885714</td>\n",
" <td>4.47619</td>\n",
" <td>1.171429</td>\n",
" </tr>\n",
" <tr>\n",
" <th>blockchain week</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tensorflow</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>consensus</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>crypto</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cryptocurrency</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ethereum</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"node_b machine learning artificial intelligence bitcoin \\\n",
"node_a \n",
"machine learning 11.733333 10.666667 0.0 \n",
"artificial intelligence NaN NaN NaN \n",
"bitcoin NaN NaN NaN \n",
"block NaN NaN NaN \n",
"block chain NaN NaN NaN \n",
"blockchai NaN NaN NaN \n",
"deep learning NaN NaN NaN \n",
"python NaN NaN NaN \n",
"blockchain 0.000000 0.000000 6.0 \n",
"blockchain week NaN NaN NaN \n",
"tensorflow NaN NaN NaN \n",
"consensus NaN NaN NaN \n",
"crypto NaN NaN NaN \n",
"cryptocurrency NaN NaN NaN \n",
"ethereum NaN NaN NaN \n",
"\n",
"node_b block block chain blockchai deep learning \\\n",
"node_a \n",
"machine learning 0.000000 0.0 0.000000 6.8 \n",
"artificial intelligence NaN NaN NaN NaN \n",
"bitcoin NaN NaN NaN NaN \n",
"block NaN NaN NaN NaN \n",
"block chain NaN NaN NaN NaN \n",
"blockchai NaN NaN NaN NaN \n",
"deep learning NaN NaN NaN NaN \n",
"python NaN NaN NaN NaN \n",
"blockchain 0.361905 0.6 0.466667 0.0 \n",
"blockchain week NaN NaN NaN NaN \n",
"tensorflow NaN NaN NaN NaN \n",
"consensus NaN NaN NaN NaN \n",
"crypto NaN NaN NaN NaN \n",
"cryptocurrency NaN NaN NaN NaN \n",
"ethereum NaN NaN NaN NaN \n",
"\n",
"node_b python blockchain blockchain week tensorflow \\\n",
"node_a \n",
"machine learning 6.4 0.000000 0.000000 1.8 \n",
"artificial intelligence NaN NaN NaN NaN \n",
"bitcoin NaN NaN NaN NaN \n",
"block NaN NaN NaN NaN \n",
"block chain NaN NaN NaN NaN \n",
"blockchai NaN NaN NaN NaN \n",
"deep learning NaN NaN NaN NaN \n",
"python NaN NaN NaN NaN \n",
"blockchain 0.0 9.774286 0.295238 0.0 \n",
"blockchain week NaN NaN NaN NaN \n",
"tensorflow NaN NaN NaN NaN \n",
"consensus NaN NaN NaN NaN \n",
"crypto NaN NaN NaN NaN \n",
"cryptocurrency NaN NaN NaN NaN \n",
"ethereum NaN NaN NaN NaN \n",
"\n",
"node_b consensus crypto cryptocurrency ethereum \n",
"node_a \n",
"machine learning 0.000000 0.000000 0.00000 0.000000 \n",
"artificial intelligence NaN NaN NaN NaN \n",
"bitcoin NaN NaN NaN NaN \n",
"block NaN NaN NaN NaN \n",
"block chain NaN NaN NaN NaN \n",
"blockchai NaN NaN NaN NaN \n",
"deep learning NaN NaN NaN NaN \n",
"python NaN NaN NaN NaN \n",
"blockchain 0.438095 8.885714 4.47619 1.171429 \n",
"blockchain week NaN NaN NaN NaN \n",
"tensorflow NaN NaN NaN NaN \n",
"consensus NaN NaN NaN NaN \n",
"crypto NaN NaN NaN NaN \n",
"cryptocurrency NaN NaN NaN NaN \n",
"ethereum NaN NaN NaN NaN "
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"queries = ['machine learning', 'artificial intelligence', 'bitcoin', 'block', 'block chain', 'blockchai', 'deep learning', 'python', 'blockchain', 'blockchain week', 'tensorflow', 'consensus', 'crypto', 'cryptocurrency', 'ethereum']\n",
"\n",
"# only 'machine learning' and 'blockchain' are exemplars, so reindex instead of chained .loc;\n",
"# rows for non-exemplar queries come back as NaN\n",
"masked_exemplar_query_mat.reindex(index=queries, columns=queries)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Save the matrices for later use (skipping the write if the files already exist)."
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"import os.path\n",
"\n",
"masked_exemplar_query_mat_file = 'data/masked_exemplar_query_mat.csv'\n",
"exemplar_query_mat_file = 'data/exemplar_query_mat.csv'\n",
"\n",
"if not os.path.isfile(masked_exemplar_query_mat_file):\n",
" masked_exemplar_query_mat.to_csv(masked_exemplar_query_mat_file)\n",
"if not os.path.isfile(exemplar_query_mat_file):\n",
" exemplar_query_mat.to_csv(exemplar_query_mat_file)"
]
}
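,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When these CSVs are read back in a later step (not shown in this notebook), the query strings need to be restored as the row index, e.g.:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"# assumed follow-up usage: reload with the query strings as the index\n",
"masked_exemplar_query_mat = pd.read_csv('data/masked_exemplar_query_mat.csv', index_col=0)\n",
"exemplar_query_mat = pd.read_csv('data/exemplar_query_mat.csv', index_col=0)\n",
"```"
]
}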
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}