Last active
June 19, 2016 15:24
-
-
Save pramsey/d3a0a8e9d1c2e6920189320c4adc8f8f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
""" | |
This code demonstrates how to use dedupe to match messy records
against a deduplicated, canonical dataset. In this example, we'll be
matching voter list strings against a list of valid addresses.
""" | |
import os | |
import csv | |
import re | |
import logging | |
import optparse | |
from numpy import nan | |
import dedupe | |
import unidecode | |
import usaddress | |
from pprint import pprint | |
# ## Logging
# Dedupe uses Python logging to show or suppress verbose output.
# To enable verbose logging, pass -v (repeat the flag for more detail).
optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()
# opts.verbose is None when -v was never given; normalize to 0 so the
# integer comparisons below are safe (None >= 2 is a TypeError on Py3).
verbosity = opts.verbose or 0
log_level = logging.WARNING
if verbosity == 1:
    log_level = logging.INFO
elif verbosity >= 2:
    log_level = logging.DEBUG
logging.getLogger().setLevel(log_level)
def preproc(column):
    """
    Normalize one raw CSV cell so that equivalent values compare equal
    during matching: strip punctuation, collapse separators to spaces,
    uppercase, and trim.  A cell that is empty after cleaning becomes
    None (dedupe's marker for a missing value).
    """
    cleaned = (column.replace('-', '')
                     .replace('/', ' ')
                     .replace("'", '')
                     .replace(',', '')
                     .replace(':', ' '))
    cleaned = cleaned.upper().strip()
    return cleaned if cleaned else None
def voters_csv_reader(unicode_csv_data):
    """
    Yield one cleaned dict per row of the voters file.  The file has no
    header row, so the column names are supplied explicitly here.  Each
    cell is decoded from UTF-8 and run through preproc(); empty cells
    become None.
    """
    csv_columns = ('RIDING', 'POLL', 'VOTER_ID', 'LAST_NAME', 'LAST_NAME_NORMALIZED',
                   'GIVEN_NAMES', 'STREET_NAME', 'STREET_NAME_NORMALIZED', 'STREET_TYPE',
                   'STREET_DIR_SUFFIX', 'STREET_NUMBER', 'STREET_NUMBER_SUFFIX', 'UNIT_NUMBER',
                   'VOTER_NUMBER', 'MISC1', 'LOCALITY', 'MISC2', 'POSTAL_CODE',
                   'POSTAL_ADDRESS_1', 'POSTAL_ADDRESS_2', 'POSTAL_LOCALITY', 'MISC3')
    reader = csv.DictReader(utf_8_encoder(unicode_csv_data),
                            fieldnames=csv_columns,
                            delimiter=',', quotechar='"')
    for record in reader:
        # Decode each UTF-8 cell back to unicode, then normalize it.
        cleaned = {}
        for key, value in record.items():
            cleaned[key] = preproc(unicode(value, 'utf-8')) if value else None
        yield cleaned
def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
    """
    DictReader wrapper for unicode input (the Py2 csv module is
    bytes-only): encode to UTF-8 before parsing, decode each cell on
    the way out, and normalize every cell with preproc().  Empty cells
    become None.
    """
    reader = csv.DictReader(utf_8_encoder(unicode_csv_data),
                            dialect=dialect, **kwargs)
    for record in reader:
        yield dict((key, preproc(unicode(value, 'utf-8')) if value else None)
                   for key, value in record.items())
def utf_8_encoder(unicode_csv_data):
    """
    Round-trip each line through UTF-8.  The decode step validates the
    input encoding (raising on malformed bytes); the re-encode yields
    the UTF-8 byte string the Py2 csv module expects.
    """
    for raw_line in unicode_csv_data:
        yield raw_line.decode('utf-8').encode('utf-8')
def readAddressData(input_file):
    """
    Load the canonical address file (tab-separated, with a header row)
    into a dict keyed by ADDRESS_ID — the record-id-keyed mapping that
    dedupe's Gazetteer expects for the canonical side.
    """
    records = {}
    with open(input_file) as handle:
        for record in unicode_csv_reader(handle, delimiter="\t"):
            records[record['ADDRESS_ID']] = record
    return records
def readVoterData(input_file):
    """
    Load the (headerless) voters file into a dict keyed by VOTER_ID —
    the record-id-keyed mapping dedupe expects for the messy side.
    """
    records = {}
    with open(input_file) as handle:
        for record in voters_csv_reader(handle):
            records[record['VOTER_ID']] = record
    return records
# ## Setup | |
output_file = 'address_matching_output.csv' | |
settings_file = 'address_matching_learned_settings' | |
training_file = 'address_matching_training.json' | |
address_file = 'addresses.txt' | |
voter_file = 'PPABM201304R.csv' | |
print 'importing voter data ...' | |
messy_addresses = readVoterData(voter_file) | |
print 'importing address data ...' | |
canonical_addresses = readAddressData(address_file) | |
# ## Training | |
if os.path.exists(settings_file): | |
print 'reading from', settings_file | |
with open(settings_file) as sf : | |
linker = dedupe.StaticGazetteer(sf, num_cores=2) | |
else: | |
# Define the fields dedupe will pay attention to | |
fields = [ | |
{'field':'STREET_NAME', 'type':'String'}, | |
{'field':'STREET_TYPE', 'type':'Exact', 'has missing':True}, | |
{'field':'STREET_DIR_SUFFIX', 'type':'ShortString', 'has missing':True}, | |
{'field':'STREET_NUMBER', 'type':'Exact', 'has missing':True}, | |
{'field':'STREET_NUMBER_SUFFIX', 'type':'ShortString', 'has missing':True}, | |
{'field':'LOCALITY', 'type':'String'} | |
] | |
# Create a new linker object and pass our data model to it. | |
linker = dedupe.Gazetteer(fields, num_cores=2) | |
# To train dedupe, we feed it a random sample of records. | |
linker.sample(messy_addresses, canonical_addresses, 3000) | |
if os.path.exists(training_file): | |
print 'reading labeled examples from ', training_file | |
with open(training_file) as tf : | |
linker.readTraining(tf) | |
dedupe.consoleLabel(linker) | |
linker.train() | |
# When finished, save our training away to disk | |
with open(training_file, 'w') as tf : | |
linker.writeTraining(tf) | |
# Save our weights and predicates to disk. If the settings file | |
# exists, we will skip all the training and learning next time we run | |
# this file. | |
with open(settings_file, 'w') as sf : | |
linker.writeSettings(sf) | |
linker.cleanupTraining() | |
print 'indexing' | |
linker.index(canonical_addresses) | |
clustered_dupes = [] | |
print 'clustering...' | |
clustered_dupes = linker.match(messy_addresses, 0.0) | |
print '# duplicate sets', len(clustered_dupes) | |
print 'out of', len(messy_addresses) | |
canonical_lookup = {} | |
for n_results in clustered_dupes : | |
(source_id, target_id), score = n_results[0] | |
canonical_lookup[source_id] = (target_id, score) | |
with open(output_file, 'w') as f: | |
writer = csv.writer(f) | |
writer.writerow(['address_id', 'voter_id', | |
'Score', 'x_coord', 'y_coord']) | |
for record_id, record in messy_addresses.items() : | |
row = [record['VOTER_ID'], '', '', '', ''] | |
if record_id in canonical_lookup : | |
canonical_id, score = canonical_lookup[record_id] | |
row[1] = record['POSTAL_ADDRESS_1'] | |
row[2] = score | |
row[3] = canonical_id | |
row[4] = canonical_addresses[canonical_id]['FULL_ADDRESS'] | |
writer.writerow(row) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
STREET_NAME : 12TH | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 32921 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
STREET_NAME : 12TH | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 7828 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : BURNABY | |
0/10 positive, 0/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : CHERRY | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 33762 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
STREET_NAME : CHERRY | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 32598 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
0/10 positive, 1/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : DEWDNEY TRUNK | |
STREET_TYPE : RD | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 9131 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
STREET_NAME : HWY | |
STREET_TYPE : HWY | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 9131 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : POWELL RIVER REGIONAL DISTRICT | |
0/10 positive, 2/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : LETHBRIDGE | |
STREET_TYPE : DR | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 3788 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : ABBOTSFORD | |
STREET_NAME : None | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 29457 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : ABBOTSFORD | |
0/10 positive, 3/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : HARMS | |
STREET_TYPE : ST | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 8493 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
STREET_NAME : BAILEY | |
STREET_TYPE : PL | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 8493 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
0/10 positive, 4/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : 4TH | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 33150 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
STREET_NAME : 4TH | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 32866 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
0/10 positive, 5/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : OLD YALE | |
STREET_TYPE : RD | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 34909 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : ABBOTSFORD | |
STREET_NAME : OLD YALE | |
STREET_TYPE : RD | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 38691 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : ABBOTSFORD | |
0/10 positive, 6/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : 3RD | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 32904 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
... (further labeling prompts elided) ...
0/10 positive, 38/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
f | |
Finished labeling | |
Traceback (most recent call last): | |
File "match_add_voters.py", line 147, in <module> | |
linker.train() | |
File "/Library/Python/2.7/site-packages/dedupe/api.py", line 735, in train | |
index_predicates) | |
File "/Library/Python/2.7/site-packages/dedupe/api.py", line 760, in _trainBlocker | |
recall) | |
File "/Library/Python/2.7/site-packages/dedupe/training.py", line 80, in learn | |
raise ValueError(NO_PREDICATES_ERROR) | |
ValueError: No predicate found! We could not learn a single good predicate. Maybe give Dedupe more training data or increasing the `max_comparisons` argument to the train method |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment