Skip to content

Instantly share code, notes, and snippets.

@pramsey
Last active June 19, 2016 15:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pramsey/d3a0a8e9d1c2e6920189320c4adc8f8f to your computer and use it in GitHub Desktop.
Save pramsey/d3a0a8e9d1c2e6920189320c4adc8f8f to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This code demonstrates how to use dedupe with to match messy records
against a deduplicated, canonical dataset. In this example, we'll be
matching voters list strings against a list of valid adddresses
"""
import os
import csv
import re
import logging
import optparse
from numpy import nan
import dedupe
import unidecode
import usaddress
from pprint import pprint
# ## Logging
# Dedupe uses Python logging to show or suppress verbose output. Added for convenience.
# To enable verbose logging, run `python examples/csv_example/csv_example.py -v`
optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count',
help='Increase verbosity (specify multiple times for more)'
)
(opts, args) = optp.parse_args()
log_level = logging.WARNING
if opts.verbose == 1:
log_level = logging.INFO
elif opts.verbose >= 2:
log_level = logging.DEBUG
logging.getLogger().setLevel(log_level)
def preproc(column):
"""
Do a little bit of data cleaning with the help of Unidecode and Regex.
Things like casing, extra spaces, quotes and new lines can be ignored.
"""
column = column.replace('-', '').replace('/', ' ').replace("'", '').replace(',','')
column = column.replace(':', ' ').upper().strip()
if not column :
column = None
return column
def voters_csv_reader(unicode_csv_data):
csv_columns = ('RIDING', 'POLL', 'VOTER_ID', 'LAST_NAME', 'LAST_NAME_NORMALIZED',
'GIVEN_NAMES', 'STREET_NAME', 'STREET_NAME_NORMALIZED', 'STREET_TYPE',
'STREET_DIR_SUFFIX', 'STREET_NUMBER', 'STREET_NUMBER_SUFFIX', 'UNIT_NUMBER',
'VOTER_NUMBER', 'MISC1', 'LOCALITY', 'MISC2', 'POSTAL_CODE',
'POSTAL_ADDRESS_1', 'POSTAL_ADDRESS_2', 'POSTAL_LOCALITY', 'MISC3')
csv_reader = csv.DictReader(utf_8_encoder(unicode_csv_data),
fieldnames=csv_columns, delimiter=',', quotechar='"')
for row in csv_reader:
# decode UTF-8 back to Unicode, cell by cell:
yield {k : preproc(unicode(v, 'utf-8')) if v else None for k, v in row.items()}
def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
# csv.py doesn't do Unicode; encode temporarily as UTF-8:
csv_reader = csv.DictReader(utf_8_encoder(unicode_csv_data),
dialect=dialect, **kwargs)
for row in csv_reader:
# decode UTF-8 back to Unicode, cell by cell:
yield {k : preproc(unicode(v, 'utf-8')) if v else None for k, v in row.items()}
def utf_8_encoder(unicode_csv_data):
for line in unicode_csv_data:
yield line.decode('utf-8').encode('utf-8')
def readAddressData(input_file):
data = {}
with open(input_file) as f:
reader = unicode_csv_reader(f, delimiter="\t")
for row in reader:
# pprint(row)
# exit(1)
data[row['ADDRESS_ID']] = row
return data
def readVoterData(input_file):
data = {}
with open(input_file) as f:
reader = voters_csv_reader(f)
for row in reader:
data[row['VOTER_ID']] = row
return data
# ## Setup
output_file = 'address_matching_output.csv'
settings_file = 'address_matching_learned_settings'
training_file = 'address_matching_training.json'
address_file = 'addresses.txt'
voter_file = 'PPABM201304R.csv'
print 'importing voter data ...'
messy_addresses = readVoterData(voter_file)
print 'importing address data ...'
canonical_addresses = readAddressData(address_file)
# ## Training
if os.path.exists(settings_file):
print 'reading from', settings_file
with open(settings_file) as sf :
linker = dedupe.StaticGazetteer(sf, num_cores=2)
else:
# Define the fields dedupe will pay attention to
fields = [
{'field':'STREET_NAME', 'type':'String'},
{'field':'STREET_TYPE', 'type':'Exact', 'has missing':True},
{'field':'STREET_DIR_SUFFIX', 'type':'ShortString', 'has missing':True},
{'field':'STREET_NUMBER', 'type':'Exact', 'has missing':True},
{'field':'STREET_NUMBER_SUFFIX', 'type':'ShortString', 'has missing':True},
{'field':'LOCALITY', 'type':'String'}
]
# Create a new linker object and pass our data model to it.
linker = dedupe.Gazetteer(fields, num_cores=2)
# To train dedupe, we feed it a random sample of records.
linker.sample(messy_addresses, canonical_addresses, 3000)
if os.path.exists(training_file):
print 'reading labeled examples from ', training_file
with open(training_file) as tf :
linker.readTraining(tf)
dedupe.consoleLabel(linker)
linker.train()
# When finished, save our training away to disk
with open(training_file, 'w') as tf :
linker.writeTraining(tf)
# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
with open(settings_file, 'w') as sf :
linker.writeSettings(sf)
linker.cleanupTraining()
print 'indexing'
linker.index(canonical_addresses)
clustered_dupes = []
print 'clustering...'
clustered_dupes = linker.match(messy_addresses, 0.0)
print '# duplicate sets', len(clustered_dupes)
print 'out of', len(messy_addresses)
canonical_lookup = {}
for n_results in clustered_dupes :
(source_id, target_id), score = n_results[0]
canonical_lookup[source_id] = (target_id, score)
with open(output_file, 'w') as f:
writer = csv.writer(f)
writer.writerow(['address_id', 'voter_id',
'Score', 'x_coord', 'y_coord'])
for record_id, record in messy_addresses.items() :
row = [record['VOTER_ID'], '', '', '', '']
if record_id in canonical_lookup :
canonical_id, score = canonical_lookup[record_id]
row[1] = record['POSTAL_ADDRESS_1']
row[2] = score
row[3] = canonical_id
row[4] = canonical_addresses[canonical_id]['FULL_ADDRESS']
writer.writerow(row)
STREET_NAME : 12TH
STREET_TYPE : AVE
STREET_DIR_SUFFIX : None
STREET_NUMBER : 32921
STREET_NUMBER_SUFFIX : None
LOCALITY : MISSION
STREET_NAME : 12TH
STREET_TYPE : AVE
STREET_DIR_SUFFIX : None
STREET_NUMBER : 7828
STREET_NUMBER_SUFFIX : None
LOCALITY : BURNABY
0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
n
STREET_NAME : CHERRY
STREET_TYPE : AVE
STREET_DIR_SUFFIX : None
STREET_NUMBER : 33762
STREET_NUMBER_SUFFIX : None
LOCALITY : MISSION
STREET_NAME : CHERRY
STREET_TYPE : AVE
STREET_DIR_SUFFIX : None
STREET_NUMBER : 32598
STREET_NUMBER_SUFFIX : None
LOCALITY : MISSION
0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
n
STREET_NAME : DEWDNEY TRUNK
STREET_TYPE : RD
STREET_DIR_SUFFIX : None
STREET_NUMBER : 9131
STREET_NUMBER_SUFFIX : None
LOCALITY : MISSION
STREET_NAME : HWY
STREET_TYPE : HWY
STREET_DIR_SUFFIX : None
STREET_NUMBER : 9131
STREET_NUMBER_SUFFIX : None
LOCALITY : POWELL RIVER REGIONAL DISTRICT
0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
n
STREET_NAME : LETHBRIDGE
STREET_TYPE : DR
STREET_DIR_SUFFIX : None
STREET_NUMBER : 3788
STREET_NUMBER_SUFFIX : None
LOCALITY : ABBOTSFORD
STREET_NAME : None
STREET_TYPE : AVE
STREET_DIR_SUFFIX : None
STREET_NUMBER : 29457
STREET_NUMBER_SUFFIX : None
LOCALITY : ABBOTSFORD
0/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
n
STREET_NAME : HARMS
STREET_TYPE : ST
STREET_DIR_SUFFIX : None
STREET_NUMBER : 8493
STREET_NUMBER_SUFFIX : None
LOCALITY : MISSION
STREET_NAME : BAILEY
STREET_TYPE : PL
STREET_DIR_SUFFIX : None
STREET_NUMBER : 8493
STREET_NUMBER_SUFFIX : None
LOCALITY : MISSION
0/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
n
STREET_NAME : 4TH
STREET_TYPE : AVE
STREET_DIR_SUFFIX : None
STREET_NUMBER : 33150
STREET_NUMBER_SUFFIX : None
LOCALITY : MISSION
STREET_NAME : 4TH
STREET_TYPE : AVE
STREET_DIR_SUFFIX : None
STREET_NUMBER : 32866
STREET_NUMBER_SUFFIX : None
LOCALITY : MISSION
0/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
n
STREET_NAME : OLD YALE
STREET_TYPE : RD
STREET_DIR_SUFFIX : None
STREET_NUMBER : 34909
STREET_NUMBER_SUFFIX : None
LOCALITY : ABBOTSFORD
STREET_NAME : OLD YALE
STREET_TYPE : RD
STREET_DIR_SUFFIX : None
STREET_NUMBER : 38691
STREET_NUMBER_SUFFIX : None
LOCALITY : ABBOTSFORD
0/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
n
STREET_NAME : 3RD
STREET_TYPE : AVE
STREET_DIR_SUFFIX : None
STREET_NUMBER : 32904
STREET_NUMBER_SUFFIX : None
LOCALITY : MISSION
... ... mOre...
0/10 positive, 38/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
f
Finished labeling
Traceback (most recent call last):
File "match_add_voters.py", line 147, in <module>
linker.train()
File "/Library/Python/2.7/site-packages/dedupe/api.py", line 735, in train
index_predicates)
File "/Library/Python/2.7/site-packages/dedupe/api.py", line 760, in _trainBlocker
recall)
File "/Library/Python/2.7/site-packages/dedupe/training.py", line 80, in learn
raise ValueError(NO_PREDICATES_ERROR)
ValueError: No predicate found! We could not learn a single good predicate. Maybe give Dedupe more training data or increasing the `max_comparisons` argument to the train method
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment