address_matcher.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This code demonstrates how to use dedupe to match messy records
against a deduplicated, canonical dataset. In this example, we'll be
matching messy address strings against a list of valid addresses in
Chicago.
"""
from __future__ import print_function

import os
import csv
import re
import logging
import optparse

import dedupe
import unidecode
import usaddress  # dedupe's 'Address' variable type parses with usaddress
# ## Logging

# Dedupe uses Python logging to show or suppress verbose output. Added
# for convenience. To enable verbose logging, run
# `python address_matcher.py -v`
optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()
log_level = logging.WARNING
if opts.verbose:
    if opts.verbose == 1:
        log_level = logging.INFO
    elif opts.verbose >= 2:
        log_level = logging.DEBUG
logging.basicConfig(level=log_level)
def preProcess(column):
    """
    Do a little bit of data cleaning with the help of
    [unidecode](https://pypi.org/project/Unidecode/) and regex.
    Things like casing, extra spaces, quotes and new lines get
    normalized away.
    """
    column = unidecode.unidecode(column)
    column = re.sub('\n', ' ', column)
    column = re.sub('-', '', column)
    column = re.sub('/', ' ', column)
    column = re.sub("'", '', column)
    column = re.sub(",", '', column)
    column = re.sub(":", ' ', column)
    column = re.sub(' +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column
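
# A quick sanity check of what preProcess does. The expected value below
# is worked out from the substitution rules above, not output shipped
# with the original gist:
#
# >>> preProcess("123 N. Main St,\n Apt 4")
# '123 n. main st apt 4'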
def readData(input_file):
    """
    The data we'll be matching are address strings. dedupe's 'Address'
    variable type will parse each string into meaningful subcomponents,
    so here we just normalize the strings with preProcess. We also carry
    along the latitude/longitude columns for the output report.
    """
    data = {}
    with open(input_file) as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            clean_row = {'original_address': row['cmpaddabrv'],
                         'cmpaddabrv': preProcess(row['cmpaddabrv']),
                         # only the canonical file carries coordinates,
                         # so fall back to None for the messy file
                         'latitude': row.get('latitude'),
                         'longitude': row.get('longitude')}
            if clean_row['cmpaddabrv']:
                data[input_file + str(i)] = clean_row
    return data
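
# Records come back keyed by file name plus row index, so a record from
# the messy file might look like this (illustrative values only):
#
# data['addresses.csv0'] == {'original_address': '123 N. Main St,',
#                            'cmpaddabrv': '123 n. main st',
#                            'latitude': None, 'longitude': None}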
# ## Setup
output_file = 'address_matching_output.csv'
settings_file = 'address_matching_learned_settings'
training_file = 'address_matching_training.json'
canonical_file = 'data/just_addresses.csv'
messy_file = 'addresses.csv'

print('importing data ...')
messy_addresses = readData(messy_file)
canonical_addresses = readData(canonical_file)
# ## Training
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file) as sf:
        linker = dedupe.StaticGazetteer(sf, num_cores=2)
else:
    # Define the fields dedupe will pay attention to.
    #
    # The 'Address' variable type (from the dedupe-variable-address
    # add-on) knows how to compare addresses component by component.
    fields = [{'field': 'cmpaddabrv', 'type': 'Address'}]

    # Create a new linker object and pass our data model to it.
    linker = dedupe.Gazetteer(fields)
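
    # Under the hood, the 'Address' type parses each string with
    # usaddress, roughly like this (illustrative, output abridged):
    #
    # >>> usaddress.tag('123 n main st chicago il')
    # (OrderedDict([('AddressNumber', '123'),
    #               ('StreetNamePreDirectional', 'n'),
    #               ('StreetName', 'main'),
    #               ('StreetNamePostType', 'st'), ...]), 'Street Address')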
    # To train dedupe, we feed it a random sample of records.
    linker.sample(messy_addresses, canonical_addresses, 3000)

    if os.path.exists(training_file):
        print('reading labeled examples from', training_file)
        with open(training_file) as tf:
            linker.readTraining(tf)

    dedupe.consoleLabel(linker)

    linker.train()

    # When finished, save our labeled examples to disk
    with open(training_file, 'w') as tf:
        linker.writeTraining(tf)

    # Save our weights and predicates to disk. If the settings file
    # exists, we will skip all the training and learning next time we
    # run this file.
    with open(settings_file, 'w') as sf:
        linker.writeSettings(sf)

    linker.cleanupTraining()
print('indexing')
# index() builds the blocking index over the canonical records so that
# match() can quickly find candidate pairs for each messy record.
linker.index(canonical_addresses)
print('clustering...')
clustered_dupes = linker.match(messy_addresses, 0.0)
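
# Each element of clustered_dupes corresponds to one messy record that
# found a match and holds ((messy_id, canonical_id), confidence) pairs,
# best match first; the loop below keeps only the top match. A threshold
# of 0.0 keeps every candidate; raise it to trade recall for precision.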
print('# duplicate sets', len(clustered_dupes))
print('out of', len(messy_addresses))

canonical_lookup = {}
for n_results in clustered_dupes:
    (source_id, target_id), score = n_results[0]
    canonical_lookup[source_id] = (target_id, score)
with open(output_file, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['Messy Address', 'Canonical Address',
                     'Score', 'x_coord', 'y_coord', 'Original Address'])

    for record_id, record in messy_addresses.items():
        row = [record['cmpaddabrv'], '', '', '', '', '']
        if record_id in canonical_lookup:
            canonical_id, score = canonical_lookup[record_id]
            row[1] = canonical_addresses[canonical_id]['cmpaddabrv']
            row[2] = score
            row[3] = canonical_addresses[canonical_id]['longitude']
            row[4] = canonical_addresses[canonical_id]['latitude']
            row[5] = record['original_address']
        writer.writerow(row)
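
# A finished row in address_matching_output.csv might look like this
# (values are made up for illustration):
#
# 123 n main st,123 n main st,0.97,-87.6298,41.8781,"123 N. Main St,"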