cwurld/gist:bbd138c953b416644ed66901d1e9196b

## gistfile1.txt
# Gazetteer ---------------------------------------------------------------------------------------------------------
def load_deduped_output_for_gazetteer(filename):
    """
    Parse data from dedupe into:

        1. Canonical dataset (no dupes)
        2. messy data - all the dupes
        3. markPairs.

    Every row of the CSV must have the columns 'Cluster ID', 'id' and 'confidence_score' plus each field named by
    get_dedupe_field_names(). Blank field values are replaced with None.

    :param filename: path of the CSV file written after running dedupe.
    :return: tuple (data_d, messy_d, labeled_examples):

        data_d - the first row seen for each cluster, keyed by int(row['id'])
        messy_d - every later row of an already seen cluster, keyed the same way
        labeled_examples - {'match': [...], 'distinct': [...]}, each a list of (record, record) tuples
    """
    # For markPairs
    dups = {}  # dups[cluster_id] = [a record from the cluster, another record from the cluster]
    singletons = []

    # For Gazetteer.sample()
    data_d = {}
    messy_d = {}

    clusters_used = set()

    with open(filename) as f_input:
        reader = csv.DictReader(f_input)
        field_names = get_dedupe_field_names()  # a function used by Dedupe and Gazetteer

        # Parse results from output from dedupe.
        for row in reader:
            cluster_id = row['Cluster ID']
            row_id = int(row['id'])

            # The data is already clean, replace blank fields with None
            clean_row = {k: row[k] or None for k in field_names}

            # Put one item from each cluster into data_d, the rest go into messy_d.
            # A copy is stored so these records are separate objects from the ones kept for markPairs.
            if cluster_id in clusters_used:
                messy_d[row_id] = dict(clean_row)
            else:
                clusters_used.add(cluster_id)
                data_d[row_id] = dict(clean_row)

            # Singletons do not have a confidence score.
            if row['confidence_score']:
                cluster_rows = dups.setdefault(cluster_id, [])
                # Two records per cluster are enough to form one match pair
                if len(cluster_rows) < 2:
                    cluster_rows.append(clean_row)
            else:
                singletons.append(clean_row)

    # Load matches ----------------------------------------------------------------------------------------------
    # A cluster that contributed only one scored row cannot form a pair, so it is skipped instead of raising
    # IndexError.
    matches = [(pair[0], pair[1]) for pair in dups.values() if len(pair) == 2]

    # Make a set of distinct pairs ------------------------------------------------------------------------------
    # Make a shuffled set of indices into singletons
    ss = list(range(len(singletons)))
    random.shuffle(ss)  # shuffle is in-place

    # Use up to 50 distinct pairs per match pair; stop early when fewer than two singletons remain.
    max_distinct = 50 * len(matches)
    distinct = []
    while len(distinct) < max_distinct and len(ss) >= 2:
        first = singletons[ss.pop()]
        second = singletons[ss.pop()]
        distinct.append((first, second))

    # markPairs wants this dict
    labeled_examples = {'match': matches, 'distinct': distinct}

    return data_d, messy_d, labeled_examples


# https://www.snip2code.com/Snippet/460447/address_matcher-py
def my_gazetteer():
    """
    Build (or reload) a dedupe Gazetteer from deduped output and try it on one sample record.

    The deduped CSV is split into canonical data, messy data and labeled pairs by
    load_deduped_output_for_gazetteer(). If the settings file already exists a StaticGazetteer is loaded from it;
    otherwise a Gazetteer is trained from the labeled pairs and its training data and settings are written to
    disk. The canonical data is then indexed, the messy data is matched against it, and finally a single
    hard-coded test record is matched. Results are reported with print().

    :return: [] if matching the test record raises ValueError, otherwise None.
    """
    filename = 'dedupe_params/test_output.csv'  # file after dedupe

    g_settings_file = 'dedupe_params/g_learned_settings.dat'
    g_training_file = 'dedupe_params/g_training.json'

    data_d, messy_d, labeled_examples = load_deduped_output_for_gazetteer(filename)

    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(g_settings_file):
        print('reading from', g_settings_file)
        # Open the same path that was just checked (the original referenced an undefined name here).
        with open(g_settings_file, 'rb') as f:
            linker = dedupe.StaticGazetteer(f)
    else:
        fields = get_dedupe_fields()
        linker = dedupe.Gazetteer(fields)
        linker.sample(messy_d, data_d, 3000)

        # Since we are using markPairs, we do not need to do manual matching with dedupe.consoleLabel(deduper)
        linker.markPairs(labeled_examples)
        linker.train()

        with open(g_training_file, 'w') as tf:
            linker.writeTraining(tf)

        with open(g_settings_file, 'wb') as sf:
            linker.writeSettings(sf)

        linker.cleanupTraining()

    linker.index(data_d)
    # NOTE(review): the threshold is computed from data_d (the canonical set) while matching runs on messy_d -
    # confirm against the dedupe documentation which dataset threshold() expects.
    threshold = linker.threshold(data_d)
    clustered_dupes = linker.match(messy_d, threshold=threshold)

    print('Found {} duplicate sets out of {}'.format(len(clustered_dupes), len(messy_d)))

    test = {1234: {'id': 1234,  'first_name': 'Groucho', 'last_name': 'Marx', 'account_name': 'ROI Park',
                   'email1': 'g@roipark.com',
                   'phone_work': '', 'phone_mobile': '', 'primary_address_state': 'WI'}}
    c_test = {1234: clean_record(test[1234])}

    # If the possible dup is very different from the training set, it can generate a ValueError. It turns out this
    # can happen even for records that look similar to the training set.
    try:
        matches = linker.match(c_test, threshold)
    except ValueError:
        return []

    if matches:
        print('{} has matches'.format(test))
    else:
        print('{} does not match any record'.format(test))

# Run the gazetteer demo only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    my_gazetteer()
	# Gazetteer ---------------------------------------------------------------------------------------------------------
	def load_deduped_output_for_gazetteer(filename):
	    """
	    Parse data from dedupe into:

	        1. Canonical dataset (no dupes)
	        2. messy data - all the dupes
	        3. markPairs.

	    Every row of the CSV must have the columns 'Cluster ID', 'id' and 'confidence_score' plus each field named by
	    get_dedupe_field_names(). Blank field values are replaced with None.

	    :param filename: path of the CSV file written after running dedupe.
	    :return: tuple (data_d, messy_d, labeled_examples):

	        data_d - the first row seen for each cluster, keyed by int(row['id'])
	        messy_d - every later row of an already seen cluster, keyed the same way
	        labeled_examples - {'match': [...], 'distinct': [...]}, each a list of (record, record) tuples
	    """
	    # For markPairs
	    dups = {}  # dups[cluster_id] = [a record from the cluster, another record from the cluster]
	    singletons = []

	    # For Gazetteer.sample()
	    data_d = {}
	    messy_d = {}

	    clusters_used = set()

	    with open(filename) as f_input:
	        reader = csv.DictReader(f_input)
	        field_names = get_dedupe_field_names()  # a function used by Dedupe and Gazetteer

	        # Parse results from output from dedupe.
	        for row in reader:
	            cluster_id = row['Cluster ID']
	            row_id = int(row['id'])

	            # The data is already clean, replace blank fields with None
	            clean_row = {k: row[k] or None for k in field_names}

	            # Put one item from each cluster into data_d, the rest go into messy_d.
	            # A copy is stored so these records are separate objects from the ones kept for markPairs.
	            if cluster_id in clusters_used:
	                messy_d[row_id] = dict(clean_row)
	            else:
	                clusters_used.add(cluster_id)
	                data_d[row_id] = dict(clean_row)

	            # Singletons do not have a confidence score.
	            if row['confidence_score']:
	                cluster_rows = dups.setdefault(cluster_id, [])
	                # Two records per cluster are enough to form one match pair
	                if len(cluster_rows) < 2:
	                    cluster_rows.append(clean_row)
	            else:
	                singletons.append(clean_row)

	    # Load matches ----------------------------------------------------------------------------------------------
	    # A cluster that contributed only one scored row cannot form a pair, so it is skipped instead of raising
	    # IndexError.
	    matches = [(pair[0], pair[1]) for pair in dups.values() if len(pair) == 2]

	    # Make a set of distinct pairs ------------------------------------------------------------------------------
	    # Make a shuffled set of indices into singletons
	    ss = list(range(len(singletons)))
	    random.shuffle(ss)  # shuffle is in-place

	    # Use up to 50 distinct pairs per match pair; stop early when fewer than two singletons remain.
	    max_distinct = 50 * len(matches)
	    distinct = []
	    while len(distinct) < max_distinct and len(ss) >= 2:
	        first = singletons[ss.pop()]
	        second = singletons[ss.pop()]
	        distinct.append((first, second))

	    # markPairs wants this dict
	    labeled_examples = {'match': matches, 'distinct': distinct}

	    return data_d, messy_d, labeled_examples


	# https://www.snip2code.com/Snippet/460447/address_matcher-py
	def my_gazetteer():
	    """
	    Build (or reload) a dedupe Gazetteer from deduped output and try it on one sample record.

	    The deduped CSV is split into canonical data, messy data and labeled pairs by
	    load_deduped_output_for_gazetteer(). If the settings file already exists a StaticGazetteer is loaded from it;
	    otherwise a Gazetteer is trained from the labeled pairs and its training data and settings are written to
	    disk. The canonical data is then indexed, the messy data is matched against it, and finally a single
	    hard-coded test record is matched. Results are reported with print().

	    :return: [] if matching the test record raises ValueError, otherwise None.
	    """
	    filename = 'dedupe_params/test_output.csv'  # file after dedupe

	    g_settings_file = 'dedupe_params/g_learned_settings.dat'
	    g_training_file = 'dedupe_params/g_training.json'

	    data_d, messy_d, labeled_examples = load_deduped_output_for_gazetteer(filename)

	    # If a settings file already exists, we'll just load that and skip training
	    if os.path.exists(g_settings_file):
	        print('reading from', g_settings_file)
	        # Open the same path that was just checked (the original referenced an undefined name here).
	        with open(g_settings_file, 'rb') as f:
	            linker = dedupe.StaticGazetteer(f)
	    else:
	        fields = get_dedupe_fields()
	        linker = dedupe.Gazetteer(fields)
	        linker.sample(messy_d, data_d, 3000)

	        # Since we are using markPairs, we do not need to do manual matching with dedupe.consoleLabel(deduper)
	        linker.markPairs(labeled_examples)
	        linker.train()

	        with open(g_training_file, 'w') as tf:
	            linker.writeTraining(tf)

	        with open(g_settings_file, 'wb') as sf:
	            linker.writeSettings(sf)

	        linker.cleanupTraining()

	    linker.index(data_d)
	    # NOTE(review): the threshold is computed from data_d (the canonical set) while matching runs on messy_d -
	    # confirm against the dedupe documentation which dataset threshold() expects.
	    threshold = linker.threshold(data_d)
	    clustered_dupes = linker.match(messy_d, threshold=threshold)

	    print('Found {} duplicate sets out of {}'.format(len(clustered_dupes), len(messy_d)))

	    test = {1234: {'id': 1234, 'first_name': 'Groucho', 'last_name': 'Marx', 'account_name': 'ROI Park',
	                   'email1': 'g@roipark.com',
	                   'phone_work': '', 'phone_mobile': '', 'primary_address_state': 'WI'}}
	    c_test = {1234: clean_record(test[1234])}

	    # If the possible dup is very different from the training set, it can generate a ValueError. It turns out this
	    # can happen even for records that look similar to the training set.
	    try:
	        matches = linker.match(c_test, threshold)
	    except ValueError:
	        return []

	    if matches:
	        print('{} has matches'.format(test))
	    else:
	        print('{} does not match any record'.format(test))

	# Run the gazetteer demo only when this file is executed as a script, not when it is imported.
	if __name__ == '__main__':
	    my_gazetteer()