Last active
June 19, 2016 15:24
-
-
Save pramsey/d3a0a8e9d1c2e6920189320c4adc8f8f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
""" | |
This code demonstrates how to use dedupe to match messy records
against a deduplicated, canonical dataset. In this example, we'll be
matching voter list strings against a list of valid addresses.
""" | |
import os | |
import csv | |
import re | |
import logging | |
import optparse | |
from numpy import nan | |
import dedupe | |
import unidecode | |
import usaddress | |
from pprint import pprint | |
# ## Logging
# Dedupe uses Python logging to show or suppress verbose output.
# To enable verbose logging, pass -v (repeat the flag for more detail).
optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()
# opts.verbose is None when -v was never given; normalize to 0 so the
# integer comparisons below are safe (None >= 2 is a TypeError on Py3).
verbosity = opts.verbose or 0
log_level = logging.WARNING
if verbosity == 1:
    log_level = logging.INFO
elif verbosity >= 2:
    log_level = logging.DEBUG
logging.getLogger().setLevel(log_level)
def preproc(column):
    """
    Normalize one raw CSV cell so that equivalent values compare equal
    during matching: strip punctuation, collapse separators to spaces,
    uppercase, and trim.  A cell that is empty after cleaning becomes
    None (dedupe's marker for a missing value).
    """
    cleaned = (column.replace('-', '')
                     .replace('/', ' ')
                     .replace("'", '')
                     .replace(',', '')
                     .replace(':', ' '))
    cleaned = cleaned.upper().strip()
    return cleaned if cleaned else None
def voters_csv_reader(unicode_csv_data):
    """
    Yield one cleaned dict per row of the voters file.  The file has no
    header row, so the column names are supplied explicitly here.  Each
    cell is decoded from UTF-8 and run through preproc(); empty cells
    become None.
    """
    csv_columns = ('RIDING', 'POLL', 'VOTER_ID', 'LAST_NAME', 'LAST_NAME_NORMALIZED',
                   'GIVEN_NAMES', 'STREET_NAME', 'STREET_NAME_NORMALIZED', 'STREET_TYPE',
                   'STREET_DIR_SUFFIX', 'STREET_NUMBER', 'STREET_NUMBER_SUFFIX', 'UNIT_NUMBER',
                   'VOTER_NUMBER', 'MISC1', 'LOCALITY', 'MISC2', 'POSTAL_CODE',
                   'POSTAL_ADDRESS_1', 'POSTAL_ADDRESS_2', 'POSTAL_LOCALITY', 'MISC3')
    reader = csv.DictReader(utf_8_encoder(unicode_csv_data),
                            fieldnames=csv_columns,
                            delimiter=',', quotechar='"')
    for record in reader:
        # Decode each UTF-8 cell back to unicode, then normalize it.
        cleaned = {}
        for key, value in record.items():
            cleaned[key] = preproc(unicode(value, 'utf-8')) if value else None
        yield cleaned
def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
    """
    DictReader wrapper for unicode input (the Py2 csv module is
    bytes-only): encode to UTF-8 before parsing, decode each cell on
    the way out, and normalize every cell with preproc().  Empty cells
    become None.
    """
    reader = csv.DictReader(utf_8_encoder(unicode_csv_data),
                            dialect=dialect, **kwargs)
    for record in reader:
        yield dict((key, preproc(unicode(value, 'utf-8')) if value else None)
                   for key, value in record.items())
def utf_8_encoder(unicode_csv_data):
    """
    Round-trip each line through UTF-8.  The decode step validates the
    input encoding (raising on malformed bytes); the re-encode yields
    the UTF-8 byte string the Py2 csv module expects.
    """
    for raw_line in unicode_csv_data:
        yield raw_line.decode('utf-8').encode('utf-8')
def readAddressData(input_file):
    """
    Load the canonical address file (tab-separated, with a header row)
    into a dict keyed by ADDRESS_ID — the record-id-keyed mapping that
    dedupe's Gazetteer expects for the canonical side.
    """
    records = {}
    with open(input_file) as handle:
        for record in unicode_csv_reader(handle, delimiter="\t"):
            records[record['ADDRESS_ID']] = record
    return records
def readVoterData(input_file):
    """
    Load the (headerless) voters file into a dict keyed by VOTER_ID —
    the record-id-keyed mapping dedupe expects for the messy side.
    """
    records = {}
    with open(input_file) as handle:
        for record in voters_csv_reader(handle):
            records[record['VOTER_ID']] = record
    return records
# ## Setup | |
output_file = 'address_matching_output.csv' | |
settings_file = 'address_matching_learned_settings' | |
training_file = 'address_matching_training.json' | |
address_file = 'addresses.txt' | |
voter_file = 'PPABM201304R.csv' | |
print 'importing voter data ...' | |
messy_addresses = readVoterData(voter_file) | |
print 'importing address data ...' | |
canonical_addresses = readAddressData(address_file) | |
# ## Training | |
if os.path.exists(settings_file): | |
print 'reading from', settings_file | |
with open(settings_file) as sf : | |
linker = dedupe.StaticGazetteer(sf, num_cores=2) | |
else: | |
# Define the fields dedupe will pay attention to | |
fields = [ | |
{'field':'STREET_NAME', 'type':'String'}, | |
{'field':'STREET_TYPE', 'type':'Exact', 'has missing':True}, | |
{'field':'STREET_DIR_SUFFIX', 'type':'ShortString', 'has missing':True}, | |
{'field':'STREET_NUMBER', 'type':'Exact', 'has missing':True}, | |
{'field':'STREET_NUMBER_SUFFIX', 'type':'ShortString', 'has missing':True}, | |
{'field':'LOCALITY', 'type':'String'} | |
] | |
# Create a new linker object and pass our data model to it. | |
linker = dedupe.Gazetteer(fields, num_cores=2) | |
# To train dedupe, we feed it a random sample of records. | |
linker.sample(messy_addresses, canonical_addresses, 3000) | |
if os.path.exists(training_file): | |
print 'reading labeled examples from ', training_file | |
with open(training_file) as tf : | |
linker.readTraining(tf) | |
dedupe.consoleLabel(linker) | |
linker.train() | |
# When finished, save our training away to disk | |
with open(training_file, 'w') as tf : | |
linker.writeTraining(tf) | |
# Save our weights and predicates to disk. If the settings file | |
# exists, we will skip all the training and learning next time we run | |
# this file. | |
with open(settings_file, 'w') as sf : | |
linker.writeSettings(sf) | |
linker.cleanupTraining() | |
print 'indexing' | |
linker.index(canonical_addresses) | |
clustered_dupes = [] | |
print 'clustering...' | |
clustered_dupes = linker.match(messy_addresses, 0.0) | |
print '# duplicate sets', len(clustered_dupes) | |
print 'out of', len(messy_addresses) | |
canonical_lookup = {} | |
for n_results in clustered_dupes : | |
(source_id, target_id), score = n_results[0] | |
canonical_lookup[source_id] = (target_id, score) | |
with open(output_file, 'w') as f: | |
writer = csv.writer(f) | |
writer.writerow(['address_id', 'voter_id', | |
'Score', 'x_coord', 'y_coord']) | |
for record_id, record in messy_addresses.items() : | |
row = [record['VOTER_ID'], '', '', '', ''] | |
if record_id in canonical_lookup : | |
canonical_id, score = canonical_lookup[record_id] | |
row[1] = record['POSTAL_ADDRESS_1'] | |
row[2] = score | |
row[3] = canonical_id | |
row[4] = canonical_addresses[canonical_id]['FULL_ADDRESS'] | |
writer.writerow(row) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
STREET_NAME : 12TH | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 32921 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
STREET_NAME : 12TH | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 7828 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : BURNABY | |
0/10 positive, 0/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : CHERRY | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 33762 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
STREET_NAME : CHERRY | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 32598 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
0/10 positive, 1/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : DEWDNEY TRUNK | |
STREET_TYPE : RD | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 9131 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
STREET_NAME : HWY | |
STREET_TYPE : HWY | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 9131 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : POWELL RIVER REGIONAL DISTRICT | |
0/10 positive, 2/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : LETHBRIDGE | |
STREET_TYPE : DR | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 3788 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : ABBOTSFORD | |
STREET_NAME : None | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 29457 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : ABBOTSFORD | |
0/10 positive, 3/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : HARMS | |
STREET_TYPE : ST | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 8493 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
STREET_NAME : BAILEY | |
STREET_TYPE : PL | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 8493 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
0/10 positive, 4/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : 4TH | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 33150 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
STREET_NAME : 4TH | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 32866 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
0/10 positive, 5/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : OLD YALE | |
STREET_TYPE : RD | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 34909 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : ABBOTSFORD | |
STREET_NAME : OLD YALE | |
STREET_TYPE : RD | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 38691 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : ABBOTSFORD | |
0/10 positive, 6/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
n | |
STREET_NAME : 3RD | |
STREET_TYPE : AVE | |
STREET_DIR_SUFFIX : None | |
STREET_NUMBER : 32904 | |
STREET_NUMBER_SUFFIX : None | |
LOCALITY : MISSION | |
... (further labeling prompts elided) ...
0/10 positive, 38/10 negative | |
Do these records refer to the same thing? | |
(y)es / (n)o / (u)nsure / (f)inished | |
f | |
Finished labeling | |
Traceback (most recent call last): | |
File "match_add_voters.py", line 147, in <module> | |
linker.train() | |
File "/Library/Python/2.7/site-packages/dedupe/api.py", line 735, in train | |
index_predicates) | |
File "/Library/Python/2.7/site-packages/dedupe/api.py", line 760, in _trainBlocker | |
recall) | |
File "/Library/Python/2.7/site-packages/dedupe/training.py", line 80, in learn | |
raise ValueError(NO_PREDICATES_ERROR) | |
ValueError: No predicate found! We could not learn a single good predicate. Maybe give Dedupe more training data or increasing the `max_comparisons` argument to the train method |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment