address_matcher.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This code demonstrates how to use dedupe to match messy records
against a deduplicated, canonical dataset. In this example, we'll be
matching messy address strings against a list of valid addresses in
Chicago.
"""
from __future__ import print_function
import os
import csv
import re
import logging
import optparse
import dedupe
import unidecode
# ## Logging
# Dedupe uses Python logging to show or suppress verbose output. Added for convenience.
# To enable verbose logging, run `python address_matcher.py -v`
optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                help='Increase verbosity (specify multiple times for more)')
(opts, args) = optp.parse_args()
log_level = logging.WARNING
if opts.verbose:
    if opts.verbose == 1:
        log_level = logging.INFO
    elif opts.verbose >= 2:
        log_level = logging.DEBUG
logging.basicConfig(level=log_level)
def preProcess(column):
    """
    Do a little bit of data cleaning with the help of
    [unidecode](https://pypi.org/project/Unidecode/) and regular
    expressions: normalize to ASCII, strip punctuation, collapse
    whitespace, and lowercase, so trivial formatting differences
    don't look like real differences to dedupe.
    """
    column = unidecode.unidecode(column)
    column = re.sub('\n', ' ', column)
    column = re.sub('-', '', column)
    column = re.sub('/', ' ', column)
    column = re.sub("'", '', column)
    column = re.sub(",", '', column)
    column = re.sub(":", ' ', column)
    column = re.sub(' +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column
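# For example (a hypothetical input, traced through the rules above):
#   preProcess("100 W. O'BRIEN AVE,\nCHICAGO") -> '100 w. obrien ave chicago'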
def readData(input_file):
    """
    Read an input CSV into a dictionary of records keyed on a unique
    ID. The address string we'll be matching on lives in the
    'cmpaddabrv' column; we keep the raw value under 'original_address'
    and a cleaned copy under 'cmpaddabrv'. We also carry along
    'latitude' and 'longitude' values, which the canonical file is
    expected to provide, so they can be written to the output later.
    """
    data = {}
    with open(input_file) as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            # NOTE: assumes coordinates live in 'latitude'/'longitude'
            # columns; row.get returns None where a column is absent
            clean_row = {'original_address': row.get('cmpaddabrv'),
                         'cmpaddabrv': preProcess(row['cmpaddabrv']),
                         'latitude': row.get('latitude'),
                         'longitude': row.get('longitude')}
            if clean_row['cmpaddabrv']:
                data[input_file + str(i)] = clean_row
    return data
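# A record produced by readData() looks like (values invented for
# illustration):
#   {'addresses.csv0': {'original_address': '123 N STATE ST',
#                       'cmpaddabrv': '123 n state st',
#                       'latitude': None, 'longitude': None}}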
# ## Setup
output_file = 'address_matching_output.csv'
settings_file = 'address_matching_learned_settings'
training_file = 'address_matching_training.json'
canonical_file = 'data/just_addresses.csv'
messy_file = 'addresses.csv'
print('importing data ...')
messy_addresses = readData(messy_file)
canonical_addresses = readData(canonical_file)
# ## Training
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file) as sf:
        linker = dedupe.StaticGazetteer(sf, num_cores=2)
else:
    # Define the fields dedupe will pay attention to. Here that's a
    # single 'Address' field, which tells dedupe to compare the
    # strings as parsed US addresses.
    fields = [{'field': 'cmpaddabrv', 'type': 'Address'}]
    # Create a new linker object and pass our data model to it.
    linker = dedupe.Gazetteer(fields)
    # To train dedupe, we feed it a random sample of records.
    linker.sample(messy_addresses, canonical_addresses, 3000)
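    # `sample` draws random pairs of messy and canonical records (3000
    # here) from which the active-learning step below picks labeling
    # candidates.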
    # If we have labeled examples from a previous session, bootstrap
    # active learning with them.
    if os.path.exists(training_file):
        print('reading labeled examples from', training_file)
        with open(training_file) as tf:
            linker.readTraining(tf)
    # Active learning: dedupe shows you pairs of records in the console
    # and asks whether they refer to the same address; label until the
    # boundary cases are covered, then mark yourself finished.
    dedupe.consoleLabel(linker)
    linker.train()
    # When finished, save our training away to disk
    with open(training_file, 'w') as tf:
        linker.writeTraining(tf)
    # Save our weights and predicates to disk. If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open(settings_file, 'w') as sf:
        linker.writeSettings(sf)
    linker.cleanupTraining()
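# `index` adds the canonical records to the gazetteer's blocking
# index, so `match` only scores each messy record against plausible
# canonical candidates instead of the full cross product.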
print('indexing')
linker.index(canonical_addresses)
print('clustering...')
clustered_dupes = linker.match(messy_addresses, 0.0)
print('# duplicate sets', len(clustered_dupes))
print('out of', len(messy_addresses))
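# Each element of clustered_dupes corresponds to one messy record and
# holds its candidate matches as ((messy_id, canonical_id), score)
# pairs, best match first; taking element 0 below keeps the top
# candidate for each messy record.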
canonical_lookup = {}
for n_results in clustered_dupes:
    (source_id, target_id), score = n_results[0]
    canonical_lookup[source_id] = (target_id, score)
with open(output_file, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['Messy Address', 'Canonical Address',
                     'Score', 'x_coord', 'y_coord', 'Original Address'])
    for record_id, record in messy_addresses.items():
        row = [record['cmpaddabrv'], '', '', '', '', '']
        if record_id in canonical_lookup:
            canonical_id, score = canonical_lookup[record_id]
            row[1] = canonical_addresses[canonical_id]['cmpaddabrv']
            row[2] = score
            # x_coord/y_coord are the matched canonical record's
            # longitude and latitude, respectively
            row[3] = canonical_addresses[canonical_id]['longitude']
            row[4] = canonical_addresses[canonical_id]['latitude']
        row[5] = record['original_address']
        writer.writerow(row)
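# A hypothetical output row (values invented for illustration):
#   Messy Address,Canonical Address,Score,x_coord,y_coord,Original Address
#   123 n state st,123 n state st,0.9731,-87.6281,41.8850,123 N. STATE ST.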