Created
February 25, 2018 05:38
-
-
Save TomPiona/7aa58560423db9d7d14cffa9fce42218 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from itertools import combinations | |
def count_occurances(l): | |
return [[x, l.count(x)] for x in set(l)] | |
def word_pairs(l): | |
return list(combinations(set(l), 2)) | |
def calc_distance(l, dist_type): | |
return [dist_type(first, second) for first, second in word_pairs(l)] | |
def small_distance(distances, tol): | |
return [dist < tol for dist in distances] | |
def get_close_pairs(l, tol, dist_type): | |
pairs = word_pairs(l) | |
distances = calc_distance(pairs, dist_type) | |
return [pairs[i] for i in range(len(pairs)) if distances[i] < tol] | |
def which_to_which(close_pairs): | |
change_dict = {} | |
for first, second in close_pairs: | |
inp = input('[{0}] and [{1}] are close values.\nenter 1 to change all [{0}] to [{1}]\nenter 2 to change all [{1}] to [{0}]\nenter a custom value to change both\njust click enter to change nothing: '.format(first, second)) | |
if inp == '1': | |
change_dict[first] = second | |
elif inp == '2': | |
change_dict[second] = first | |
elif inp != '': | |
change_dict[first] = inp | |
change_dict[second] = inp | |
return change_dict | |
def replace_in_list(change_dict, l): | |
return [change_dict[val] if val in change_dict else val for val in l] | |
def levenshtein(source, target): | |
""" | |
this function is take from Wikipedia: | |
https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python | |
""" | |
if len(source) < len(target): | |
return levenshtein(target, source) | |
# So now we have len(source) >= len(target). | |
if len(target) == 0: | |
return len(source) | |
# We call tuple() to force strings to be used as sequences | |
# ('c', 'a', 't', 's') - numpy uses them as values by default. | |
source = np.array(tuple(source)) | |
target = np.array(tuple(target)) | |
# We use a dynamic programming algorithm, but with the | |
# added optimization that we only need the last two rows | |
# of the matrix. | |
previous_row = np.arange(target.size + 1) | |
for s in source: | |
# Insertion (target grows longer than source): | |
current_row = previous_row + 1 | |
# Substitution or matching: | |
# Target and source items are aligned, and either | |
# are different (cost of 1), or are the same (cost of 0). | |
current_row[1:] = np.minimum( | |
current_row[1:], | |
np.add(previous_row[:-1], target != s)) | |
# Deletion (target grows shorter than source): | |
current_row[1:] = np.minimum( | |
current_row[1:], | |
current_row[0:-1] + 1) | |
previous_row = current_row | |
return previous_row[-1] | |
def main(l, tol=3, dist_type=levenshtein): | |
close_pairs = get_close_pairs(l, tol, dist_type) | |
change_dict = which_to_which(close_pairs) | |
return replace_in_list(change_dict) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment