phenders/dedupe.ipynb

## dedupe.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv('csv_example/restaurants_pairs.csv', dtype={'phone': str, 'yelp_phone': str})\n",
    "# Remove the '+1' country code from the Yelp phone numbers. Matching the literal\n",
    "# text with regex=False gives the same result whether str.replace defaults to\n",
    "# regex or to literal matching, and avoids an invalid escape sequence in the pattern.\n",
    "df['yelp_phone'] = df['yelp_phone'].str.replace('+1', '', regex=False)\n",
    "df1 = df[['dba', 'address', 'zipcode', 'phone']]\n",
    "df2 = df[['yelp_name', 'yelp_address', 'yelp_zip_code', 'yelp_phone']]\n",
    "df1.columns = ['Name', 'Address', 'Zip', 'Phone']\n",
    "df2.columns = ['Name', 'Address', 'Zip', 'Phone']\n",
    "# Both frames keep the original row index, so sorting the concatenation by index\n",
    "# interleaves each matched pair; a stable sort keeps df1's row ahead of df2's.\n",
    "df = pd.concat([df1, df2]).sort_index(kind='mergesort').reset_index(drop=True)\n",
    "df['City'] = 'New York'\n",
    "df['State'] = 'NY'\n",
    "# Keep only the columns used downstream (Phone is dropped here).\n",
    "df = df[['Name', 'Address', 'City', 'State', 'Zip']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Review the input dataset\n",
    "\n",
    "For the restaurant app last year, we had to match restaurant names and addresses in the NYC inspections dataset with restaurant names and addresses in Yelp. We were able to match 95% automatically using a Levenshtein-based algorithm, but resorted to manual matching for the remaining 5%. This dataset contains restaurants we matched manually, shown here as matched pairs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Address</th>\n",
       "      <th>City</th>\n",
       "      <th>State</th>\n",
       "      <th>Zip</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Brunos On The Boulevard</td>\n",
       "      <td>8825 Astoria Blvd</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11369</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Events By Bruno's</td>\n",
       "      <td>88-25 Astoria Blvd</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11369</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Bens Best</td>\n",
       "      <td>9640 Queens Blvd</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11374</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Ben-best Deli &amp; Restaurant</td>\n",
       "      <td>9640 Queens Blvd</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11374</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Keats Restaurant</td>\n",
       "      <td>842 2nd Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>10017</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Keats Bar</td>\n",
       "      <td>842 2nd Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>10017</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Michael's Restaurant</td>\n",
       "      <td>2929 Avenue R</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11229</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Michael's of Brooklyn</td>\n",
       "      <td>2929 Ave R</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11229</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Lulu's Coffee Shop</td>\n",
       "      <td>1191 Castle Hill Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>10462</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Lulu's Luncheonette</td>\n",
       "      <td>1191 Castle Hill Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>10462</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         Name               Address      City State    Zip\n",
       "0     Brunos On The Boulevard     8825 Astoria Blvd  New York    NY  11369\n",
       "1           Events By Bruno's    88-25 Astoria Blvd  New York    NY  11369\n",
       "2                   Bens Best      9640 Queens Blvd  New York    NY  11374\n",
       "3  Ben-best Deli & Restaurant      9640 Queens Blvd  New York    NY  11374\n",
       "4            Keats Restaurant           842 2nd Ave  New York    NY  10017\n",
       "5                   Keats Bar           842 2nd Ave  New York    NY  10017\n",
       "6        Michael's Restaurant         2929 Avenue R  New York    NY  11229\n",
       "7       Michael's of Brooklyn            2929 Ave R  New York    NY  11229\n",
       "8          Lulu's Coffee Shop  1191 Castle Hill Ave  New York    NY  10462\n",
       "9         Lulu's Luncheonette  1191 Castle Hill Ave  New York    NY  10462"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first ten rows of the combined name/address table.\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Shuffle the list\n",
    "\n",
    "We don't want to feed the names and addresses into `dedupe` as matched pairs, so randomly shuffle the list."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Address</th>\n",
       "      <th>City</th>\n",
       "      <th>State</th>\n",
       "      <th>Zip</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>J &amp; L Groceries Store</td>\n",
       "      <td>682 5th Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11215</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Lua</td>\n",
       "      <td>1006 Flushing Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11237</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Times Stamps Hot Pot</td>\n",
       "      <td>811 53rd St</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11220</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Taco Tortilleria Incorporated</td>\n",
       "      <td>13408 Jamaica Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11418</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HoM Bay Ridge</td>\n",
       "      <td>8810 3rd Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11209</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Patrizias Of Brooklyn</td>\n",
       "      <td>462 2nd Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>10016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Joe</td>\n",
       "      <td>187 Columbus Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>10023</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Peking BBQ</td>\n",
       "      <td>58-11 Woodside Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11377</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Sabrina's Broadway Pizzeria</td>\n",
       "      <td>294 Broadway</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11211</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>King Chicken Pizza Burgers</td>\n",
       "      <td>11623 Jamaica Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11418</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            Name             Address      City State    Zip\n",
       "0          J & L Groceries Store         682 5th Ave  New York    NY  11215\n",
       "1                            Lua   1006 Flushing Ave  New York    NY  11237\n",
       "2           Times Stamps Hot Pot         811 53rd St  New York    NY  11220\n",
       "3  Taco Tortilleria Incorporated   13408 Jamaica Ave  New York    NY  11418\n",
       "4                  HoM Bay Ridge        8810 3rd Ave  New York    NY  11209\n",
       "5          Patrizias Of Brooklyn         462 2nd Ave  New York    NY  10016\n",
       "6                            Joe    187 Columbus Ave  New York    NY  10023\n",
       "7                     Peking BBQ  58-11 Woodside Ave  New York    NY  11377\n",
       "8    Sabrina's Broadway Pizzeria        294 Broadway  New York    NY  11211\n",
       "9     King Chicken Pizza Burgers   11623 Jamaica Ave  New York    NY  11418"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shuffle with a fixed seed so the row order (and therefore each record's Id in the\n",
    "# CSV) is the same on every fresh run of the notebook.\n",
    "df = df.sample(frac=1, random_state=42).reset_index(drop=True)\n",
    "# The new 0..n-1 index is written out as the 'Id' column.\n",
    "df.to_csv('csv_example/restaurants_all.csv', index_label='Id')\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set up dedupe\n",
    "\n",
    "This is setup code directly from one of the `dedupe` examples: https://github.com/dedupeio/dedupe-examples/blob/master/csv_example/csv_example.py. The only change is the input and output filenames."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from future.builtins import next\n",
    "\n",
    "import os\n",
    "import csv\n",
    "import re\n",
    "import logging\n",
    "import optparse\n",
    "\n",
    "import dedupe\n",
    "from unidecode import unidecode\n",
    "\n",
    "input_file = 'csv_example/restaurants_all.csv' # This file is required; the others are created at runtime\n",
    "output_file = 'csv_example/restaurants_out.csv'\n",
    "settings_file = 'csv_example/learned_settings'\n",
    "training_file = 'csv_example/training.json'\n",
    "\n",
    "def preProcess(column):\n",
    "    \"\"\"\n",
    "    Normalize one raw CSV field so cosmetic differences do not affect matching.\n",
    "\n",
    "    The value is passed through Unidecode, newlines are turned into spaces,\n",
    "    runs of spaces are collapsed, surrounding whitespace and quote characters\n",
    "    are stripped, and the result is lower-cased.\n",
    "\n",
    "    Returns the cleaned string, or None when the field is missing or empty.\n",
    "    \"\"\"\n",
    "    # csv.DictReader fills fields absent from a short row with None; treat as missing.\n",
    "    if column is None:\n",
    "        return None\n",
    "    try : # python 2/3 string differences\n",
    "        column = column.decode('utf8')\n",
    "    except AttributeError:\n",
    "        pass\n",
    "    column = unidecode(column)\n",
    "    # Convert newlines first so a space next to a newline is collapsed as well.\n",
    "    column = re.sub('\\n', ' ', column)\n",
    "    column = re.sub('  +', ' ', column)\n",
    "    column = column.strip().strip('\"').strip(\"'\").lower().strip()\n",
    "    # If data is missing, indicate that by setting the value to `None`\n",
    "    if not column:\n",
    "        column = None\n",
    "    return column\n",
    "\n",
    "def readData(filename):\n",
    "    \"\"\"\n",
    "    Load the CSV at `filename` into a dictionary of cleaned records.\n",
    "\n",
    "    Keys are the integer value of each row's `Id` column; each value is a dict\n",
    "    mapping every column name to its preProcess-ed field value.\n",
    "    \"\"\"\n",
    "    records = {}\n",
    "    with open(filename) as csv_file:\n",
    "        for record in csv.DictReader(csv_file):\n",
    "            cleaned = {field: preProcess(value) for field, value in record.items()}\n",
    "            records[int(record['Id'])] = cleaned\n",
    "    return records"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import the data and train the model\n",
    "\n",
    "Again, this is code directly from one of the `dedupe` examples: https://github.com/dedupeio/dedupe-examples/blob/master/csv_example/csv_example.py. The only change is to define the fields of interest in the source dataset. Using the `Name` and `Address` types requires installing a couple of optional libraries, as described here: https://docs.dedupe.io/en/latest/Variable-definition.html#optional-variables.\n",
    "\n",
    "You can see in the output how dedupe presents record pairs to train the model. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "importing data ...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : ali baba's turkish cuisine\n",
      "Address : 862 2nd ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10017\n",
      "\n",
      "Name : ali baba's terrace\n",
      "Address : 862 2nd ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10017\n",
      "\n",
      "0/10 positive, 0/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "starting active labeling...\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : merry land chinese restaurant\n",
      "Address : 325 e 149th st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10451\n",
      "\n",
      "Name : merry land buffet\n",
      "Address : 325 e 149th st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10451\n",
      "\n",
      "1/10 positive, 0/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dedupe.training:Final predicate set:\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), TfidfTextCanopyPredicate: (0.6, Name))\n",
      "Name : villa monte pizzeria & restaurant\n",
      "Address : 2811 richmond ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10314\n",
      "\n",
      "Name : villa monte pizzeria\n",
      "Address : 240 arden ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10312\n",
      "\n",
      "2/10 positive, 0/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : harway foods\n",
      "Address : 2863 harway ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11214\n",
      "\n",
      "Name : spoonfed nyc restaurant\n",
      "Address : 331 w 51st st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10019\n",
      "\n",
      "2/10 positive, 1/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : brother's pizza two\n",
      "Address : 95 page ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10309\n",
      "\n",
      "Name : the dl\n",
      "Address : 95 delancey st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10002\n",
      "\n",
      "2/10 positive, 2/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : ceci's restaurant\n",
      "Address : 423 hegeman ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11207\n",
      "\n",
      "Name : ceci's latin cuisine\n",
      "Address : 423 hegeman ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11207\n",
      "\n",
      "2/10 positive, 3/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : brooklyn diner\n",
      "Address : 5922-24 avenue n\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11234\n",
      "\n",
      "Name : first choice restaurant\n",
      "Address : 3893a broadway\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10032\n",
      "\n",
      "3/10 positive, 3/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dedupe.training:Final predicate set:\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), TfidfTextCanopyPredicate: (0.6, Name))\n",
      "INFO:dedupe.training:(PartialPredicate: (sameFiveCharStartPredicate, Name, CorporationName), TfidfTextCanopyPredicate: (0.8, Address))\n",
      "Name : sabrina's pizzeria & restaurant\n",
      "Address : 294 broadway\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11211\n",
      "\n",
      "Name : lucky chen chinese restaraunts\n",
      "Address : 1266 broadway\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11221\n",
      "\n",
      "3/10 positive, 4/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : ginger vegetarian\n",
      "Address : 310 ditmas ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11218\n",
      "\n",
      "Name : ginger house\n",
      "Address : 310 ditmas ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11218\n",
      "\n",
      "3/10 positive, 5/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : el anzuelo fino restaurant\n",
      "Address : 9801 jamaica ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11421\n",
      "\n",
      "Name : el anzuelo fino - woodhaven\n",
      "Address : 98-01 jamaica ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11421\n",
      "\n",
      "4/10 positive, 5/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dedupe.training:Final predicate set:\n",
      "INFO:dedupe.training:(TfidfTextCanopyPredicate: (0.4, Name), TfidfTextCanopyPredicate: (0.8, Address))\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), PartialPredicate: (suffixArray, Address, StreetName))\n",
      "Name : m2n cafeteria\n",
      "Address : 40-12 82nd st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11373\n",
      "\n",
      "Name : m2n buffet\n",
      "Address : 4012 82nd st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11373\n",
      "\n",
      "5/10 positive, 5/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dedupe.training:Final predicate set:\n",
      "INFO:dedupe.training:(TfidfTextCanopyPredicate: (0.4, Name), TfidfTextCanopyPredicate: (0.8, Address))\n",
      "INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, Name), SimplePredicate: (wholeFieldPredicate, Address))\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), PartialPredicate: (suffixArray, Address, StreetName))\n",
      "Name : john's of times square\n",
      "Address : 260 w 44th st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10036\n",
      "\n",
      "Name : john's pizza\n",
      "Address : 260 w 44th st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10036\n",
      "\n",
      "6/10 positive, 5/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dedupe.training:Final predicate set:\n",
      "INFO:dedupe.training:(TfidfTextCanopyPredicate: (0.4, Name), TfidfTextCanopyPredicate: (0.8, Address))\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), PartialPredicate: (alphaNumericPredicate, Name, CorporationName))\n",
      "INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, Name), SimplePredicate: (wholeFieldPredicate, Address))\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), PartialPredicate: (suffixArray, Address, StreetName))\n",
      "Name : emilio's ballato\n",
      "Address : 55 e houston st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10012\n",
      "\n",
      "Name : ballato's restaurant\n",
      "Address : 55 east houston st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10012\n",
      "\n",
      "7/10 positive, 5/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dedupe.training:Final predicate set:\n",
      "INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Name), TfidfTextCanopyPredicate: (0.8, Address))\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), PartialPredicate: (alphaNumericPredicate, Name, CorporationName))\n",
      "INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, Name), SimplePredicate: (wholeFieldPredicate, Address))\n",
      "Name : harlem karibe restaurant & catering\n",
      "Address : 2234 adam clayton powell jr blvd\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10027\n",
      "\n",
      "Name : harlem karibe take out\n",
      "Address : 2234 adam clayton powel jr blvd\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10027\n",
      "\n",
      "8/10 positive, 5/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dedupe.training:Final predicate set:\n",
      "INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Name), TfidfTextCanopyPredicate: (0.8, Address))\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), PartialPredicate: (alphaNumericPredicate, Name, CorporationName))\n",
      "INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, Name), SimplePredicate: (wholeFieldPredicate, Address))\n",
      "INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.2, Name), TfidfTextCanopyPredicate: (0.6, Address))\n",
      "Name : royal seafood restaurant\n",
      "Address : 103 mott st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10013\n",
      "\n",
      "Name : royal seafood cuisine\n",
      "Address : 103-105 mott st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10013\n",
      "\n",
      "9/10 positive, 5/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dedupe.training:Final predicate set:\n",
      "INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Name), TfidfTextCanopyPredicate: (0.8, Address))\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), PartialPredicate: (alphaNumericPredicate, Name, CorporationName))\n",
      "INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, Name), SimplePredicate: (wholeFieldPredicate, Address))\n",
      "INFO:dedupe.training:(PartialPredicate: (commonTwoTokens, Address, StreetName), TfidfNGramCanopyPredicate: (0.6, Name))\n",
      "INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.2, Name), TfidfTextCanopyPredicate: (0.6, Address))\n",
      "Name : bobby van's grill\n",
      "Address : 120 w 45th st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10036\n",
      "\n",
      "Name : hop won restaurant\n",
      "Address : 139 e 45th st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10017\n",
      "\n",
      "10/10 positive, 5/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dedupe.training:Final predicate set:\n",
      "INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Name), TfidfTextCanopyPredicate: (0.6, Address))\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), PartialPredicate: (alphaNumericPredicate, Name, CorporationName))\n",
      "INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, Name), SimplePredicate: (wholeFieldPredicate, Address))\n",
      "INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.2, Name), TfidfTextCanopyPredicate: (0.6, Address))\n",
      "Name : b&b restaurant corp\n",
      "Address : 165 w 26th st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10001\n",
      "\n",
      "Name : b and b restaurant\n",
      "Address : 165 w 26th st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10001\n",
      "\n",
      "10/10 positive, 6/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : fob restaurant\n",
      "Address : 271 smith st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11231\n",
      "\n",
      "Name : fob brooklyn\n",
      "Address : 271 smith st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11231\n",
      "\n",
      "11/10 positive, 6/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dedupe.training:Final predicate set:\n",
      "INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Name), TfidfTextCanopyPredicate: (0.6, Address))\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), PartialPredicate: (alphaNumericPredicate, Name, CorporationName))\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), TfidfTextCanopyPredicate: (0.8, Name))\n",
      "INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, Name), SimplePredicate: (wholeFieldPredicate, Address))\n",
      "INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.2, Name), TfidfTextCanopyPredicate: (0.6, Address))\n",
      "Name : lollipops ice cream\n",
      "Address : 4120 baychester ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10466\n",
      "\n",
      "Name : asea fusion modern asian bistro\n",
      "Address : 4120 8th ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11232\n",
      "\n",
      "12/10 positive, 6/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dedupe.training:Final predicate set:\n",
      "INFO:dedupe.training:(SimplePredicate: (sameThreeCharStartPredicate, Address), TfidfTextCanopyPredicate: (0.6, Address))\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), PartialPredicate: (alphaNumericPredicate, Name, CorporationName))\n",
      "INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, Name), SimplePredicate: (wholeFieldPredicate, Address))\n",
      "Name : best north dumpling shop dong bei jiao zi wang\n",
      "Address : 41-42a main st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11355\n",
      "\n",
      "Name : great wall chinese restaurant\n",
      "Address : 4341 main st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11355\n",
      "\n",
      "12/10 positive, 7/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : no. 1 kitchen astoria\n",
      "Address : 3023 30th ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11102\n",
      "\n",
      "Name : flo\n",
      "Address : 3720 30th ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11103\n",
      "\n",
      "12/10 positive, 8/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : antalia nyc\n",
      "Address : 17 w 45th st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10036\n",
      "\n",
      "Name : rico's chicken colombian restaurant\n",
      "Address : 146-04 45th ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11355\n",
      "\n",
      "12/10 positive, 9/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : tabata 2 japanese noodle restaurant\n",
      "Address : 557 8th ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10018\n",
      "\n",
      "Name : pax\n",
      "Address : 520 8th ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10018\n",
      "\n",
      "12/10 positive, 10/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : waverly diner\n",
      "Address : 385 6th ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10014\n",
      "\n",
      "Name : kami asian restaurant\n",
      "Address : 385 flatbush ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11238\n",
      "\n",
      "12/10 positive, 11/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : brooklyn diner\n",
      "Address : 5922-24 avenue n\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11234\n",
      "\n",
      "Name : mythos restaurant\n",
      "Address : 19629 northern blvd\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11358\n",
      "\n",
      "12/10 positive, 12/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : hector's cafe-restaurant\n",
      "Address : 44 little w 12th st\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 10014\n",
      "\n",
      "Name : hot bagels on 5th\n",
      "Address : 523 5th ave\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11215\n",
      "\n",
      "12/10 positive, 13/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Name : my three sons coffee shop\n",
      "Address : 7121 fort hamilton pkwy\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11228\n",
      "\n",
      "Name : new york sports bar\n",
      "Address : john f kennedy international airport\n",
      "City : new york\n",
      "State : ny\n",
      "Zip : 11464\n",
      "\n",
      "12/10 positive, 14/10 negative\n",
      "Do these records refer to the same thing?\n",
      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      " f\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Finished labeling\n",
      "INFO:rlr.crossvalidation:using cross validation to find optimum alpha...\n",
      "INFO:rlr.crossvalidation:optimum alpha: 0.010000, score 0.6898750594287656\n",
      "INFO:dedupe.training:Final predicate set:\n",
      "INFO:dedupe.training:(SimplePredicate: (sameThreeCharStartPredicate, Address), TfidfTextCanopyPredicate: (0.6, Address))\n",
      "INFO:dedupe.training:(PartialPredicate: (alphaNumericPredicate, Address, StreetName), PartialPredicate: (alphaNumericPredicate, Name, CorporationName))\n",
      "INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, Name), SimplePredicate: (wholeFieldPredicate, Address))\n"
     ]
    }
   ],
   "source": [
    "# Load the input records; readData and input_file come from outside this cell.\n",
    "print('importing data ...')\n",
    "data_d = readData(input_file)\n",
    "\n",
    "# If a settings file already exists, load that and skip training\n",
    "if os.path.exists(settings_file):\n",
    "    print('reading from', settings_file)\n",
    "    with open(settings_file, 'rb') as f:\n",
    "        deduper = dedupe.StaticDedupe(f)\n",
    "else:\n",
    "    # ## Training\n",
    "\n",
    "    # Define the fields dedupe will pay attention to, and the comparison\n",
    "    # type used for each one\n",
    "    fields = [\n",
    "        {'field' : 'Name', 'type': 'Name'},\n",
    "        {'field' : 'Address', 'type': 'Address'},\n",
    "        {'field' : 'City', 'type': 'String'},\n",
    "        {'field' : 'State', 'type': 'Exact'},\n",
    "        {'field' : 'Zip', 'type': 'Exact'}\n",
    "    ]\n",
    "\n",
    "    # Create a new deduper object and pass our data model to it\n",
    "    deduper = dedupe.Dedupe(fields)\n",
    "\n",
    "    # To train dedupe, feed it a sample of records\n",
    "    deduper.sample(data_d, 500) # sample size lowered to 500 (was 15000)\n",
    "\n",
    "    # If we have training data saved from a previous run of dedupe,\n",
    "    # look for it and load it in.\n",
    "    # __Note:__ if you want to train from scratch, delete the training_file\n",
    "    if os.path.exists(training_file):\n",
    "        print('reading labeled examples from ', training_file)\n",
    "        with open(training_file, 'rb') as f:\n",
    "            deduper.readTraining(f)\n",
    "\n",
    "    # ## Active learning\n",
    "    # Dedupe will find the next pair of records it is least certain about\n",
    "    # and ask you to label them as duplicates or not\n",
    "    # Use 'y', 'n' and 'u' keys to flag duplicates; press 'f' when finished\n",
    "    # (the prompt also offers 'p' for previous)\n",
    "    print('starting active labeling...')\n",
    "\n",
    "    dedupe.consoleLabel(deduper)\n",
    "\n",
    "    # Using the examples just labeled, train the deduper and learn blocking predicates\n",
    "    deduper.train()\n",
    "\n",
    "    # When finished, save our training to disk\n",
    "    with open(training_file, 'w') as tf:\n",
    "        deduper.writeTraining(tf)\n",
    "\n",
    "    # Save our weights and predicates to disk. If the settings file exists,\n",
    "    # skip all the training and learning next time we run this file\n",
    "    with open(settings_file, 'wb') as sf:\n",
    "        deduper.writeSettings(sf)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run dedupe\n",
    "\n",
    "The next step does the actual entity resolution. `dedupe` matched 488 of the 593 pairs that we had matched manually. Keep in mind that during manual matching we could often use the phone number to help with resolution; because the phone numbers are often different, I excluded them from the `dedupe` model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dedupe.api:Maximum expected recall and precision\n",
      "INFO:dedupe.api:recall: 1.000\n",
      "INFO:dedupe.api:precision: 0.999\n",
      "INFO:dedupe.api:With threshold: 0.941\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "clustering...\n",
      "# duplicate sets 488\n"
     ]
    }
   ],
   "source": [
    "# Find the threshold that will maximize a weighted average of precision and recall.\n",
    "# A recall weight of 2 would mean we care twice as much about recall as precision;\n",
    "# this cell passes recall_weight=1.\n",
    "# If we had more data, we would not pass all the blocked data to this function, but a representative sample.\n",
    "\n",
    "threshold = deduper.threshold(data_d, recall_weight=1)\n",
    "\n",
    "# ## Clustering\n",
    "\n",
    "# `match` will return sets of record IDs that dedupe believes are all referring to the same entity.\n",
    "print('clustering...')\n",
    "clustered_dupes = deduper.match(data_d, threshold)\n",
    "\n",
    "print('# duplicate sets', len(clustered_dupes))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write the results to disk\n",
    "\n",
    "This code, too, comes directly from the `dedupe` example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ## Writing Results\n",
    "\n",
    "# Write our original data back out to a CSV with new leading columns\n",
    "# 'Cluster ID' and 'confidence_score', plus one 'canonical_' column per\n",
    "# field of the canonical representation of the record's cluster.\n",
    "\n",
    "# Map every clustered record ID to its cluster ID, canonical record and score.\n",
    "cluster_membership = {}\n",
    "canonical_keys = []  # stays empty when dedupe returned no clusters\n",
    "cluster_id = 0\n",
    "for cluster_id, (id_set, scores) in enumerate(clustered_dupes):\n",
    "    cluster_d = [data_d[c] for c in id_set]\n",
    "    canonical_rep = dedupe.canonicalize(cluster_d)\n",
    "    canonical_keys = list(canonical_rep.keys())\n",
    "    for record_id, score in zip(id_set, scores):\n",
    "        cluster_membership[record_id] = {\n",
    "            'cluster id': cluster_id,\n",
    "            'canonical representation': canonical_rep,\n",
    "            'confidence': score\n",
    "        }\n",
    "\n",
    "# Records that belong to no cluster get their own IDs, numbered after the clusters.\n",
    "singleton_id = cluster_id + 1\n",
    "\n",
    "# newline='' lets the csv module control line endings itself.\n",
    "with open(output_file, 'w', newline='') as f_output, open(input_file, newline='') as f_input:\n",
    "    writer = csv.writer(f_output)\n",
    "    reader = csv.reader(f_input)\n",
    "\n",
    "    heading_row = next(reader)\n",
    "    heading_row.insert(0, 'confidence_score')\n",
    "    heading_row.insert(0, 'Cluster ID')\n",
    "    heading_row.extend('canonical_' + key for key in canonical_keys)\n",
    "    writer.writerow(heading_row)\n",
    "\n",
    "    for row in reader:\n",
    "        row_id = int(row[0])\n",
    "        membership = cluster_membership.get(row_id)\n",
    "        if membership is not None:\n",
    "            row.insert(0, membership['confidence'])\n",
    "            row.insert(0, membership['cluster id'])\n",
    "            # Append the values as text. Calling .encode('utf8') here is a\n",
    "            # Python 2 leftover that makes the csv writer emit b'...' literals.\n",
    "            canonical_rep = membership['canonical representation']\n",
    "            row.extend(canonical_rep[key] for key in canonical_keys)\n",
    "        else:\n",
    "            row.insert(0, None)\n",
    "            row.insert(0, singleton_id)\n",
    "            singleton_id += 1\n",
    "            row.extend(None for _ in canonical_keys)\n",
    "        writer.writerow(row)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## View the results\n",
    "\n",
    "Here are the first few rows of the results table, showing the matched pairs and the confidence scores. Each pair is given a cluster ID. The \"canonical name\" etc. is just the first instance in the pair."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Cluster ID</th>\n",
       "      <th>confidence_score</th>\n",
       "      <th>Name</th>\n",
       "      <th>Address</th>\n",
       "      <th>City</th>\n",
       "      <th>State</th>\n",
       "      <th>Zip</th>\n",
       "      <th>canonical_Id</th>\n",
       "      <th>canonical_Name</th>\n",
       "      <th>canonical_Address</th>\n",
       "      <th>canonical_City</th>\n",
       "      <th>canonical_State</th>\n",
       "      <th>canonical_Zip</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0.999961</td>\n",
       "      <td>J &amp; L Groceries Store</td>\n",
       "      <td>682 5th Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11215</td>\n",
       "      <td>b'0'</td>\n",
       "      <td>b'j &amp; l groceries store'</td>\n",
       "      <td>b'682 5th ave'</td>\n",
       "      <td>b'new york'</td>\n",
       "      <td>b'ny'</td>\n",
       "      <td>b'11215'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>246</th>\n",
       "      <td>0</td>\n",
       "      <td>0.999961</td>\n",
       "      <td>J and L Deli</td>\n",
       "      <td>682 5th Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11215</td>\n",
       "      <td>b'0'</td>\n",
       "      <td>b'j &amp; l groceries store'</td>\n",
       "      <td>b'682 5th ave'</td>\n",
       "      <td>b'new york'</td>\n",
       "      <td>b'ny'</td>\n",
       "      <td>b'11215'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0.999889</td>\n",
       "      <td>Lua</td>\n",
       "      <td>1006 Flushing Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11237</td>\n",
       "      <td>b'1'</td>\n",
       "      <td>b'lua'</td>\n",
       "      <td>b'1006 flushing ave'</td>\n",
       "      <td>b'new york'</td>\n",
       "      <td>b'ny'</td>\n",
       "      <td>b'11237'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>828</th>\n",
       "      <td>1</td>\n",
       "      <td>0.999889</td>\n",
       "      <td>Lua Bar</td>\n",
       "      <td>1006 Flushing Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11237</td>\n",
       "      <td>b'1'</td>\n",
       "      <td>b'lua'</td>\n",
       "      <td>b'1006 flushing ave'</td>\n",
       "      <td>b'new york'</td>\n",
       "      <td>b'ny'</td>\n",
       "      <td>b'11237'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>0.999633</td>\n",
       "      <td>Times Stamps Hot Pot</td>\n",
       "      <td>811 53rd St</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11220</td>\n",
       "      <td>b'2'</td>\n",
       "      <td>b'times stamps hot pot'</td>\n",
       "      <td>b'811 53rd st'</td>\n",
       "      <td>b'new york'</td>\n",
       "      <td>b'ny'</td>\n",
       "      <td>b'11220'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>502</th>\n",
       "      <td>2</td>\n",
       "      <td>0.999633</td>\n",
       "      <td>Laojie Hotpot</td>\n",
       "      <td>811 53rd St</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11220</td>\n",
       "      <td>b'2'</td>\n",
       "      <td>b'times stamps hot pot'</td>\n",
       "      <td>b'811 53rd st'</td>\n",
       "      <td>b'new york'</td>\n",
       "      <td>b'ny'</td>\n",
       "      <td>b'11220'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>967</th>\n",
       "      <td>3</td>\n",
       "      <td>0.998985</td>\n",
       "      <td>Express Soft Taco</td>\n",
       "      <td>13408 Jamaica Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11418</td>\n",
       "      <td>b'3'</td>\n",
       "      <td>b'taco tortilleria incorporated'</td>\n",
       "      <td>b'13408 jamaica ave'</td>\n",
       "      <td>b'new york'</td>\n",
       "      <td>b'ny'</td>\n",
       "      <td>b'11418'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>0.998985</td>\n",
       "      <td>Taco Tortilleria Incorporated</td>\n",
       "      <td>13408 Jamaica Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11418</td>\n",
       "      <td>b'3'</td>\n",
       "      <td>b'taco tortilleria incorporated'</td>\n",
       "      <td>b'13408 jamaica ave'</td>\n",
       "      <td>b'new york'</td>\n",
       "      <td>b'ny'</td>\n",
       "      <td>b'11418'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>912</th>\n",
       "      <td>4</td>\n",
       "      <td>0.999783</td>\n",
       "      <td>Hom</td>\n",
       "      <td>8810 3rd Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11209</td>\n",
       "      <td>b'4'</td>\n",
       "      <td>b'hom bay ridge'</td>\n",
       "      <td>b'8810 3rd ave'</td>\n",
       "      <td>b'new york'</td>\n",
       "      <td>b'ny'</td>\n",
       "      <td>b'11209'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>0.999783</td>\n",
       "      <td>HoM Bay Ridge</td>\n",
       "      <td>8810 3rd Ave</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "      <td>11209</td>\n",
       "      <td>b'4'</td>\n",
       "      <td>b'hom bay ridge'</td>\n",
       "      <td>b'8810 3rd ave'</td>\n",
       "      <td>b'new york'</td>\n",
       "      <td>b'ny'</td>\n",
       "      <td>b'11209'</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     Cluster ID  confidence_score                           Name  \\\n",
       "Id                                                                 \n",
       "0             0          0.999961          J & L Groceries Store   \n",
       "246           0          0.999961                   J and L Deli   \n",
       "1             1          0.999889                            Lua   \n",
       "828           1          0.999889                        Lua Bar   \n",
       "2             2          0.999633           Times Stamps Hot Pot   \n",
       "502           2          0.999633                  Laojie Hotpot   \n",
       "967           3          0.998985              Express Soft Taco   \n",
       "3             3          0.998985  Taco Tortilleria Incorporated   \n",
       "912           4          0.999783                            Hom   \n",
       "4             4          0.999783                  HoM Bay Ridge   \n",
       "\n",
       "               Address      City State    Zip canonical_Id  \\\n",
       "Id                                                           \n",
       "0          682 5th Ave  New York    NY  11215         b'0'   \n",
       "246        682 5th Ave  New York    NY  11215         b'0'   \n",
       "1    1006 Flushing Ave  New York    NY  11237         b'1'   \n",
       "828  1006 Flushing Ave  New York    NY  11237         b'1'   \n",
       "2          811 53rd St  New York    NY  11220         b'2'   \n",
       "502        811 53rd St  New York    NY  11220         b'2'   \n",
       "967  13408 Jamaica Ave  New York    NY  11418         b'3'   \n",
       "3    13408 Jamaica Ave  New York    NY  11418         b'3'   \n",
       "912       8810 3rd Ave  New York    NY  11209         b'4'   \n",
       "4         8810 3rd Ave  New York    NY  11209         b'4'   \n",
       "\n",
       "                       canonical_Name     canonical_Address canonical_City  \\\n",
       "Id                                                                           \n",
       "0            b'j & l groceries store'        b'682 5th ave'    b'new york'   \n",
       "246          b'j & l groceries store'        b'682 5th ave'    b'new york'   \n",
       "1                              b'lua'  b'1006 flushing ave'    b'new york'   \n",
       "828                            b'lua'  b'1006 flushing ave'    b'new york'   \n",
       "2             b'times stamps hot pot'        b'811 53rd st'    b'new york'   \n",
       "502           b'times stamps hot pot'        b'811 53rd st'    b'new york'   \n",
       "967  b'taco tortilleria incorporated'  b'13408 jamaica ave'    b'new york'   \n",
       "3    b'taco tortilleria incorporated'  b'13408 jamaica ave'    b'new york'   \n",
       "912                  b'hom bay ridge'       b'8810 3rd ave'    b'new york'   \n",
       "4                    b'hom bay ridge'       b'8810 3rd ave'    b'new york'   \n",
       "\n",
       "    canonical_State canonical_Zip  \n",
       "Id                                 \n",
       "0             b'ny'      b'11215'  \n",
       "246           b'ny'      b'11215'  \n",
       "1             b'ny'      b'11237'  \n",
       "828           b'ny'      b'11237'  \n",
       "2             b'ny'      b'11220'  \n",
       "502           b'ny'      b'11220'  \n",
       "967           b'ny'      b'11418'  \n",
       "3             b'ny'      b'11418'  \n",
       "912           b'ny'      b'11209'  \n",
       "4             b'ny'      b'11209'  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the clustered output and order it so members of a cluster sit together\n",
    "df = (\n",
    "    pd.read_csv('csv_example/restaurants_out.csv', index_col='Id')\n",
    "    .sort_values(['Cluster ID'])\n",
    ")\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}