TomPiona/standardizing matching strings

## standardizing matching strings
import pandas as pd
import numpy as np
from itertools import combinations


def count_occurances(l):
    return [[x, l.count(x)] for x in set(l)]

def word_pairs(l):
    return list(combinations(set(l), 2))

def calc_distance(l, dist_type):
    return [dist_type(first, second) for first, second in word_pairs(l)]

def small_distance(distances, tol):
    return [dist < tol for dist in distances]

def get_close_pairs(l, tol, dist_type):
    pairs = word_pairs(l)
    distances = calc_distance(pairs, dist_type)
    return [pairs[i] for i in range(len(pairs)) if distances[i] < tol]

def which_to_which(close_pairs):
    change_dict = {}
    for first, second in close_pairs:
        inp = input('[{0}] and [{1}] are close values.\nenter 1 to change all [{0}] to [{1}]\nenter 2 to change all [{1}] to [{0}]\nenter a custom value to change both\njust click enter to change nothing: '.format(first, second))
        if inp == '1':
            change_dict[first] = second
        elif inp == '2':
            change_dict[second] = first
        elif inp != '':
            change_dict[first] = inp
            change_dict[second] = inp
    return change_dict

def replace_in_list(change_dict, l):
    return [change_dict[val] if val in change_dict else val for val in l]

def levenshtein(source, target):
    """
    this function is take from Wikipedia:
    https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
    """
    if len(source) < len(target):
        return levenshtein(target, source)

    # So now we have len(source) >= len(target).
    if len(target) == 0:
        return len(source)

    # We call tuple() to force strings to be used as sequences
    # ('c', 'a', 't', 's') - numpy uses them as values by default.
    source = np.array(tuple(source))
    target = np.array(tuple(target))

    # We use a dynamic programming algorithm, but with the
    # added optimization that we only need the last two rows
    # of the matrix.
    previous_row = np.arange(target.size + 1)
    for s in source:
        # Insertion (target grows longer than source):
        current_row = previous_row + 1

        # Substitution or matching:
        # Target and source items are aligned, and either
        # are different (cost of 1), or are the same (cost of 0).
        current_row[1:] = np.minimum(
                current_row[1:],
                np.add(previous_row[:-1], target != s))

        # Deletion (target grows shorter than source):
        current_row[1:] = np.minimum(
                current_row[1:],
                current_row[0:-1] + 1)

        previous_row = current_row

    return previous_row[-1]

def main(l, tol=3, dist_type=levenshtein):
    close_pairs = get_close_pairs(l, tol, dist_type)
    change_dict = which_to_which(close_pairs)
    return replace_in_list(change_dict)
	import pandas as pd
	import numpy as np
	from itertools import combinations


	def count_occurances(l):
	return [[x, l.count(x)] for x in set(l)]

	def word_pairs(l):
	return list(combinations(set(l), 2))

	def calc_distance(l, dist_type):
	return [dist_type(first, second) for first, second in word_pairs(l)]

	def small_distance(distances, tol):
	return [dist < tol for dist in distances]

	def get_close_pairs(l, tol, dist_type):
	pairs = word_pairs(l)
	distances = calc_distance(pairs, dist_type)
	return [pairs[i] for i in range(len(pairs)) if distances[i] < tol]

	def which_to_which(close_pairs):
	change_dict = {}
	for first, second in close_pairs:
	inp = input('[{0}] and [{1}] are close values.\nenter 1 to change all [{0}] to [{1}]\nenter 2 to change all [{1}] to [{0}]\nenter a custom value to change both\njust click enter to change nothing: '.format(first, second))
	if inp == '1':
	change_dict[first] = second
	elif inp == '2':
	change_dict[second] = first
	elif inp != '':
	change_dict[first] = inp
	change_dict[second] = inp
	return change_dict

	def replace_in_list(change_dict, l):
	return [change_dict[val] if val in change_dict else val for val in l]

	def levenshtein(source, target):
	"""
	this function is take from Wikipedia:
	https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
	"""
	if len(source) < len(target):
	return levenshtein(target, source)

	# So now we have len(source) >= len(target).
	if len(target) == 0:
	return len(source)

	# We call tuple() to force strings to be used as sequences
	# ('c', 'a', 't', 's') - numpy uses them as values by default.
	source = np.array(tuple(source))
	target = np.array(tuple(target))

	# We use a dynamic programming algorithm, but with the
	# added optimization that we only need the last two rows
	# of the matrix.
	previous_row = np.arange(target.size + 1)
	for s in source:
	# Insertion (target grows longer than source):
	current_row = previous_row + 1

	# Substitution or matching:
	# Target and source items are aligned, and either
	# are different (cost of 1), or are the same (cost of 0).
	current_row[1:] = np.minimum(
	current_row[1:],
	np.add(previous_row[:-1], target != s))

	# Deletion (target grows shorter than source):
	current_row[1:] = np.minimum(
	current_row[1:],
	current_row[0:-1] + 1)

	previous_row = current_row

	return previous_row[-1]

	def main(l, tol=3, dist_type=levenshtein):
	close_pairs = get_close_pairs(l, tol, dist_type)
	change_dict = which_to_which(close_pairs)
	return replace_in_list(change_dict)