Skip to content

Instantly share code, notes, and snippets.

@TomPiona
Created February 25, 2018 05:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TomPiona/7aa58560423db9d7d14cffa9fce42218 to your computer and use it in GitHub Desktop.
Save TomPiona/7aa58560423db9d7d14cffa9fce42218 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
from itertools import combinations
def count_occurances(l):
return [[x, l.count(x)] for x in set(l)]
def word_pairs(l):
return list(combinations(set(l), 2))
def calc_distance(l, dist_type):
return [dist_type(first, second) for first, second in word_pairs(l)]
def small_distance(distances, tol):
return [dist < tol for dist in distances]
def get_close_pairs(l, tol, dist_type):
pairs = word_pairs(l)
distances = calc_distance(pairs, dist_type)
return [pairs[i] for i in range(len(pairs)) if distances[i] < tol]
def which_to_which(close_pairs):
change_dict = {}
for first, second in close_pairs:
inp = input('[{0}] and [{1}] are close values.\nenter 1 to change all [{0}] to [{1}]\nenter 2 to change all [{1}] to [{0}]\nenter a custom value to change both\njust click enter to change nothing: '.format(first, second))
if inp == '1':
change_dict[first] = second
elif inp == '2':
change_dict[second] = first
elif inp != '':
change_dict[first] = inp
change_dict[second] = inp
return change_dict
def replace_in_list(change_dict, l):
return [change_dict[val] if val in change_dict else val for val in l]
def levenshtein(source, target):
"""
this function is take from Wikipedia:
https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
"""
if len(source) < len(target):
return levenshtein(target, source)
# So now we have len(source) >= len(target).
if len(target) == 0:
return len(source)
# We call tuple() to force strings to be used as sequences
# ('c', 'a', 't', 's') - numpy uses them as values by default.
source = np.array(tuple(source))
target = np.array(tuple(target))
# We use a dynamic programming algorithm, but with the
# added optimization that we only need the last two rows
# of the matrix.
previous_row = np.arange(target.size + 1)
for s in source:
# Insertion (target grows longer than source):
current_row = previous_row + 1
# Substitution or matching:
# Target and source items are aligned, and either
# are different (cost of 1), or are the same (cost of 0).
current_row[1:] = np.minimum(
current_row[1:],
np.add(previous_row[:-1], target != s))
# Deletion (target grows shorter than source):
current_row[1:] = np.minimum(
current_row[1:],
current_row[0:-1] + 1)
previous_row = current_row
return previous_row[-1]
def main(l, tol=3, dist_type=levenshtein):
close_pairs = get_close_pairs(l, tol, dist_type)
change_dict = which_to_which(close_pairs)
return replace_in_list(change_dict)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment