Skip to content

Instantly share code, notes, and snippets.

@cwurld
Last active July 18, 2017 10:07
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cwurld/bbd138c953b416644ed66901d1e9196b to your computer and use it in GitHub Desktop.
Save cwurld/bbd138c953b416644ed66901d1e9196b to your computer and use it in GitHub Desktop.
Example of Python Dedupe Gazetteer
# Gazetteer ---------------------------------------------------------------------------------------------------------
def load_deduped_output_for_gazetteer(filename):
    """
    Split a dedupe output CSV into the inputs needed to train a Gazetteer.

    Every row must provide the columns 'Cluster ID', 'id' and 'confidence_score', plus each field named by
    get_dedupe_field_names(). Blank field values are replaced with None.

    :param filename: path of the CSV file written after running dedupe.
    :return: tuple (data_d, messy_d, labeled_examples) where
        1. data_d is the canonical dataset (no dupes): the first row seen for each cluster, keyed by int id.
        2. messy_d is the messy data (all the dupes): every later row of a cluster, keyed by int id.
        3. labeled_examples is the dict for markPairs: {'match': [...], 'distinct': [...]}, each a list of
           2-tuples of records.
    """
    # For markPairs
    dups = {}  # dups[cluster_id] = [a record from the cluster, another record from the cluster]
    singletons = []

    # For Gazetteer.sample()
    data_d = {}
    messy_d = {}
    clusters_used = set()

    with open(filename) as f_input:
        field_names = get_dedupe_field_names()  # a function used by Dedupe and Gazetteer

        # Parse results from output from dedupe.
        for row in csv.DictReader(f_input):
            cluster_id = row['Cluster ID']
            row_id = int(row['id'])

            # The data is already clean, replace blank fields with None
            clean_row = {k: row[k] or None for k in field_names}

            # Put one item from each cluster into data_d, the rest go into messy_d.
            # Each dict gets its own copy so it does not share the record stored for markPairs.
            if cluster_id in clusters_used:
                messy_d[row_id] = dict(clean_row)
            else:
                clusters_used.add(cluster_id)
                data_d[row_id] = dict(clean_row)

            # Singletons do not have a confidence score.
            if row['confidence_score']:
                cluster_records = dups.setdefault(cluster_id, [])
                # Only the first two records of a cluster are needed to form a match pair.
                if len(cluster_records) < 2:
                    cluster_records.append(clean_row)
            else:
                singletons.append(clean_row)

    # markPairs wants this dict
    labeled_examples = {'match': [], 'distinct': []}

    # Load matches ------------------------------------------------------------------------------------------
    # A scored cluster that contributed only one row cannot form a pair; skip it rather than raise IndexError.
    for cluster_records in dups.values():
        if len(cluster_records) == 2:
            labeled_examples['match'].append((cluster_records[0], cluster_records[1]))

    # Make a set of distinct pairs --------------------------------------------------------------------------
    # Make a shuffled set of indices into singletons
    ss = list(range(len(singletons)))
    random.shuffle(ss)  # shuffle is in-place

    n_singletons = 50 * len(dups)
    distinct = labeled_examples['distinct']
    # Pair up singletons two at a time until fewer than two indices remain. The cap is tested after each
    # append, so at most n_singletons + 1 pairs are produced (unchanged from the original behavior).
    while len(ss) >= 2:
        i1 = ss.pop()
        i2 = ss.pop()
        distinct.append((singletons[i1], singletons[i2]))
        if len(distinct) > n_singletons:
            break

    return data_d, messy_d, labeled_examples
# https://www.snip2code.com/Snippet/460447/address_matcher-py
def my_gazetteer():
    """
    Build a dedupe Gazetteer from previously deduped output and try it on a sample record.

    If the learned settings file already exists it is loaded into a StaticGazetteer; otherwise a Gazetteer is
    trained from the pairs returned by load_deduped_output_for_gazetteer() and the training data and settings
    are written to disk. The canonical records are then indexed, the messy records are matched against them,
    and a single hard-coded test record is matched as a demonstration.

    :return: [] if matching the test record raises ValueError, otherwise None.
    """
    filename = 'dedupe_params/test_output.csv'  # file after dedupe
    g_settings_file = 'dedupe_params/g_learned_settings.dat'
    g_training_file = 'dedupe_params/g_training.json'

    data_d, messy_d, labeled_examples = load_deduped_output_for_gazetteer(filename)

    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(g_settings_file):
        print('reading from', g_settings_file)
        # Open the same path that was just checked (the original opened an undefined name here).
        with open(g_settings_file, 'rb') as f:
            linker = dedupe.StaticGazetteer(f)
    else:
        fields = get_dedupe_fields()
        linker = dedupe.Gazetteer(fields)
        linker.sample(messy_d, data_d, 3000)

        # Since we are using markPairs, we do not need to do manual matching with dedupe.consoleLabel(deduper)
        linker.markPairs(labeled_examples)
        linker.train()

        with open(g_training_file, 'w') as tf:
            linker.writeTraining(tf)

        with open(g_settings_file, 'wb') as sf:
            linker.writeSettings(sf)

        linker.cleanupTraining()

    linker.index(data_d)
    threshold = linker.threshold(data_d)
    clustered_dupes = linker.match(messy_d, threshold=threshold)
    print('Found {} duplicate sets out of {}'.format(len(clustered_dupes), len(messy_d)))

    test = {1234: {'id': 1234, 'first_name': 'Groucho', 'last_name': 'Marx', 'account_name': 'ROI Park',
                   'email1': 'g@roipark.com',
                   'phone_work': '', 'phone_mobile': '', 'primary_address_state': 'WI'}}
    c_test = {1234: clean_record(test[1234])}

    # If the possible dup is very different from the training set, it can generate a ValueError. It turns out this
    # can happen even for records that look similar to the training set.
    try:
        matches = linker.match(c_test, threshold=threshold)
    except ValueError:
        return []

    if matches:
        print('{} has matches'.format(test))
    else:
        print('{} does not match any record'.format(test))
# Run the Gazetteer example only when this file is executed as a script, not when it is imported.
if "__main__" == __name__:
    my_gazetteer()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment