# fgregg/near_dupe_functions.py

## near_dupe_functions.py
def recordDistances(candidates, data_d, data_model):
    """Build a structured array of field distances for candidate id pairs.

    Args:
        candidates: Sized sequence of pairs of record ids.  ``pair[0]`` and
            ``pair[1]`` are used as keys into ``data_d`` and are stored in
            32-bit integer slots.
        data_d: Mapping from record id to the record passed to
            ``calculateDistance``.
        data_model: Mapping whose ``'fields'`` entry is a sized collection;
            it sets the width of the distance arrays and is forwarded to
            ``calculateDistance``.

    Returns:
        A NumPy structured array with one row per candidate.  The first
        field, ``'pairs'``, holds the two ids (``'pair1'``, ``'pair2'``);
        the second, ``'field_distances'``, holds ``'names'`` (20-byte
        strings) and ``'values'`` (32-bit floats), each sized by
        ``len(fields)``.
    """
    fields = data_model['fields']
    num_fields = len(fields)

    # 'S20' is the canonical code for a 20-byte string; the equivalent
    # 'a20' alias is deprecated since NumPy 2.0.
    field_dtype = [('names', 'S20', num_fields),
                   ('values', 'f4', num_fields)]

    record_dtype = [('pairs', [('pair1', 'i4'), ('pair2', 'i4')]),
                    ('field_distances', field_dtype)]

    # One buffer is allocated up front and handed to every
    # calculateDistance call (presumably filled in place -- confirm
    # against calculateDistance).
    distances = numpy.zeros(1, dtype=field_dtype)

    record_distances = numpy.zeros(len(candidates), dtype=record_dtype)

    for i, pair in enumerate(candidates):
        id_1, id_2 = pair[0], pair[1]
        c_distances = calculateDistance(data_d[id_1], data_d[id_2],
                                        fields, distances)

        record_distances[i] = ((id_1, id_2),
                               (c_distances['names'],
                                c_distances['values']))

    return record_distances

def recordDistancesII(candidates, data_model):
    """Build a structured array of field distances for keyed record pairs.

    Unlike ``recordDistances``, each candidate already carries its records,
    so no lookup table is needed.

    Args:
        candidates: Sized sequence of pairs; each pair unpacks into two
            ``(key, record)`` 2-tuples.  The keys are stored in 32-bit
            integer slots and the records are passed to
            ``calculateDistance``.
        data_model: Mapping whose ``'fields'`` entry is a sized collection;
            it sets the width of the distance arrays and is forwarded to
            ``calculateDistance``.

    Returns:
        A NumPy structured array with one row per candidate.  The first
        field, ``'pairs'``, holds the two keys (``'pair1'``, ``'pair2'``);
        the second, ``'field_distances'``, holds ``'names'`` (20-byte
        strings) and ``'values'`` (32-bit floats), each sized by
        ``len(fields)``.
    """
    fields = data_model['fields']
    num_fields = len(fields)

    # 'S20' is the canonical code for a 20-byte string; the equivalent
    # 'a20' alias is deprecated since NumPy 2.0.
    field_dtype = [('names', 'S20', num_fields),
                   ('values', 'f4', num_fields)]

    record_dtype = [('pairs', [('pair1', 'i4'), ('pair2', 'i4')]),
                    ('field_distances', field_dtype)]

    # One buffer is allocated up front and handed to every
    # calculateDistance call (presumably filled in place -- confirm
    # against calculateDistance).
    distances = numpy.zeros(1, dtype=field_dtype)

    record_distances = numpy.zeros(len(candidates), dtype=record_dtype)

    for i, ((key_1, record_1), (key_2, record_2)) in enumerate(candidates):
        c_distances = calculateDistance(record_1, record_2,
                                        fields, distances)

        record_distances[i] = ((key_1, key_2),
                               (c_distances['names'],
                                c_distances['values']))

    return record_distances

# appends training data to the training data collection

def addTrainingData(labeled_pairs, data_model, training_data=[]):
    """Append distance rows for newly labeled pairs to ``training_data``.

    Args:
        labeled_pairs: Mapping from label to a sized collection of record
            pairs; ``pair[0]`` and ``pair[1]`` of each pair are passed to
            ``core.calculateDistance``.
        data_model: Mapping whose ``'fields'`` entry is forwarded to
            ``core.calculateDistance``.
        training_data: Existing NumPy structured array.  Its dtype is
            reused for the new rows, and the dtype of its second field is
            the layout of the distance buffer.  The ``[]`` default is kept
            only for signature compatibility: a list carries no dtype, so
            an array must always be supplied.

    Returns:
        A new array consisting of ``training_data`` followed by one row
        ``(label, distances)`` per labeled pair.

    Raises:
        AttributeError: If ``training_data`` has no ``dtype`` attribute
            (for example when the default is used).
    """
    if not hasattr(training_data, 'dtype'):
        # Same exception type the bare attribute access below would raise,
        # but with a message that says what the caller must pass.
        raise AttributeError(
            "training_data must be a NumPy structured array; "
            "it provides the dtype of the rows to append")

    fields = data_model['fields']

    field_dtype = training_data.dtype[1]

    distances = numpy.zeros(1, dtype=field_dtype)

    # Count every label's examples rather than only labels 0 and 1, so the
    # buffer always matches what the loop below writes.
    num_training_pairs = sum(len(examples)
                             for examples in labeled_pairs.values())

    new_training_data = numpy.zeros(num_training_pairs,
                                    dtype=training_data.dtype)

    row = 0
    for label, examples in labeled_pairs.items():
        for pair in examples:
            # NOTE(review): the sibling functions call a bare
            # calculateDistance; confirm which name is in scope here.
            c_distances = core.calculateDistance(pair[0], pair[1],
                                                 fields, distances)

            new_training_data[row] = (label, c_distances)
            row += 1

    return numpy.append(training_data, new_training_data)
	def recordDistances(candidates, data_d, data_model):
		"""Build a structured array of field distances for candidate id pairs.

		Args:
			candidates: Sized sequence of pairs of record ids.  ``pair[0]``
				and ``pair[1]`` are used as keys into ``data_d`` and are
				stored in 32-bit integer slots.
			data_d: Mapping from record id to the record passed to
				``calculateDistance``.
			data_model: Mapping whose ``'fields'`` entry is a sized
				collection; it sets the width of the distance arrays and is
				forwarded to ``calculateDistance``.

		Returns:
			A NumPy structured array with one row per candidate.  The first
			field, ``'pairs'``, holds the two ids (``'pair1'``, ``'pair2'``);
			the second, ``'field_distances'``, holds ``'names'`` (20-byte
			strings) and ``'values'`` (32-bit floats), each sized by
			``len(fields)``.
		"""
		fields = data_model['fields']
		num_fields = len(fields)

		# 'S20' is the canonical code for a 20-byte string; the equivalent
		# 'a20' alias is deprecated since NumPy 2.0.
		field_dtype = [
			('names', 'S20', num_fields),
			('values', 'f4', num_fields),
		]

		record_dtype = [
			('pairs', [('pair1', 'i4'), ('pair2', 'i4')]),
			('field_distances', field_dtype),
		]

		# One buffer is allocated up front and handed to every
		# calculateDistance call (presumably filled in place -- confirm
		# against calculateDistance).
		distances = numpy.zeros(1, dtype=field_dtype)

		record_distances = numpy.zeros(len(candidates), dtype=record_dtype)

		for i, pair in enumerate(candidates):
			id_1, id_2 = pair[0], pair[1]
			c_distances = calculateDistance(
				data_d[id_1], data_d[id_2], fields, distances)

			record_distances[i] = (
				(id_1, id_2),
				(c_distances['names'], c_distances['values']),
			)

		return record_distances

	def recordDistancesII(candidates, data_model):
		"""Build a structured array of field distances for keyed record pairs.

		Unlike ``recordDistances``, each candidate already carries its
		records, so no lookup table is needed.

		Args:
			candidates: Sized sequence of pairs; each pair unpacks into two
				``(key, record)`` 2-tuples.  The keys are stored in 32-bit
				integer slots and the records are passed to
				``calculateDistance``.
			data_model: Mapping whose ``'fields'`` entry is a sized
				collection; it sets the width of the distance arrays and is
				forwarded to ``calculateDistance``.

		Returns:
			A NumPy structured array with one row per candidate.  The first
			field, ``'pairs'``, holds the two keys (``'pair1'``,
			``'pair2'``); the second, ``'field_distances'``, holds
			``'names'`` (20-byte strings) and ``'values'`` (32-bit floats),
			each sized by ``len(fields)``.
		"""
		fields = data_model['fields']
		num_fields = len(fields)

		# 'S20' is the canonical code for a 20-byte string; the equivalent
		# 'a20' alias is deprecated since NumPy 2.0.
		field_dtype = [
			('names', 'S20', num_fields),
			('values', 'f4', num_fields),
		]

		record_dtype = [
			('pairs', [('pair1', 'i4'), ('pair2', 'i4')]),
			('field_distances', field_dtype),
		]

		# One buffer is allocated up front and handed to every
		# calculateDistance call (presumably filled in place -- confirm
		# against calculateDistance).
		distances = numpy.zeros(1, dtype=field_dtype)

		record_distances = numpy.zeros(len(candidates), dtype=record_dtype)

		for i, ((key_1, record_1), (key_2, record_2)) in enumerate(candidates):
			c_distances = calculateDistance(
				record_1, record_2, fields, distances)

			record_distances[i] = (
				(key_1, key_2),
				(c_distances['names'], c_distances['values']),
			)

		return record_distances

	# appends training data to the training data collection

	def addTrainingData(labeled_pairs, data_model, training_data=[]):
		"""Append distance rows for newly labeled pairs to ``training_data``.

		Args:
			labeled_pairs: Mapping from label to a sized collection of
				record pairs; ``pair[0]`` and ``pair[1]`` of each pair are
				passed to ``core.calculateDistance``.
			data_model: Mapping whose ``'fields'`` entry is forwarded to
				``core.calculateDistance``.
			training_data: Existing NumPy structured array.  Its dtype is
				reused for the new rows, and the dtype of its second field
				is the layout of the distance buffer.  The ``[]`` default is
				kept only for signature compatibility: a list carries no
				dtype, so an array must always be supplied.

		Returns:
			A new array consisting of ``training_data`` followed by one row
			``(label, distances)`` per labeled pair.

		Raises:
			AttributeError: If ``training_data`` has no ``dtype`` attribute
				(for example when the default is used).
		"""
		if not hasattr(training_data, 'dtype'):
			# Same exception type the bare attribute access below would
			# raise, but with a message that says what the caller must pass.
			raise AttributeError(
				"training_data must be a NumPy structured array; "
				"it provides the dtype of the rows to append")

		fields = data_model['fields']

		field_dtype = training_data.dtype[1]

		distances = numpy.zeros(1, dtype=field_dtype)

		# Count every label's examples rather than only labels 0 and 1, so
		# the buffer always matches what the loop below writes.
		num_training_pairs = sum(
			len(examples) for examples in labeled_pairs.values())

		new_training_data = numpy.zeros(
			num_training_pairs, dtype=training_data.dtype)

		row = 0
		for label, examples in labeled_pairs.items():
			for pair in examples:
				# NOTE(review): the sibling functions call a bare
				# calculateDistance; confirm which name is in scope here.
				c_distances = core.calculateDistance(
					pair[0], pair[1], fields, distances)

				new_training_data[row] = (label, c_distances)
				row += 1

		return numpy.append(training_data, new_training_data)