@fantods
Created February 25, 2020 17:24
Fixing eggcorns using Entity Resolution
import csv

from nltk import tokenize

from er_core.er import ER
from er_core.preprocessors.normalizer import Normalizer

normalizer = Normalizer("default")
bad_string = "I hole-hardedly agree, but allow me to play doubles advocate here for a moment. For all intensive purposes I think you are wrong. In an age where false morals are a diamond dozen, true virtues are a blessing in the skies. We often put our false morality on a petal stool like a bunch of pre-Madonnas, but you all seem to be taking something very valuable for granite. So I ask of you to mustard up all the strength you can because it is a doggy dog world out there. Although there is some merit to what you are saying it seems like you have a huge ship on your shoulder. In your argument you seem to throw everything in but the kids Nsync, and even though you are having a feel day with this I am here to bring you back into reality. I have a sick sense when it comes to these types of things. It is almost spooky, because I cannot turn a blonde eye to these glaring flaws in your rhetoric. I have zero taller ants when it comes to people spouting out hate in the name of moral righteousness. You just need to remember what comes around is all around, and when supply and command fails you will be the first to go. It's clear who makes the pants in this relationship, and sometimes you just have to swallow your prize and accept the facts. You might have to come to this conclusion through denial and error but I swear on my mother's mating name that when you put the petal to the medal you will pass with flying carpets like it’s a peach of cake."
def fix_eggcorns(string):
    """Replace each eggcorn found in `string` with its correct idiom."""
    idiom_map = get_idiom_map()
    sentences = normalizer(tokenize.sent_tokenize(string))
    fixed_sentences = []
    for sent in sentences:
        # Compare every 4-word window of the sentence against the idiom map.
        sliding_words = list(sliding_window(sent, 4))
        matches = fix_sentence(sliding_words, idiom_map)
        for match in matches:
            correct_phrase = idiom_map[match.matching_value]
            sent = sent.replace(match.matching_value, correct_phrase)
        fixed_sentences.append(sent.capitalize())
    return fixed_sentences
def get_idiom_map():
    """Load the eggcorn -> correct-phrase mapping from idiom-map.csv."""
    data = {}
    with open("idiom-map.csv", "rt") as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for line in reader:
            line = normalizer(line)
            data[line[0]] = line[1]
    return data
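The idiom-map.csv file itself isn't attached to the gist; from the code above, get_idiom_map assumes a header row followed by eggcorn/correct-phrase pairs. A minimal sketch of the same parsing logic against hypothetical file contents:

```python
import csv
import io

# Hypothetical contents for idiom-map.csv; the real file isn't attached to the gist.
sample_csv = (
    "eggcorn,correct phrase\n"
    "doggy dog,dog-eat-dog\n"
    "for all intensive purposes,for all intents and purposes\n"
)

reader = csv.reader(io.StringIO(sample_csv))
next(reader)  # skip the header row, as get_idiom_map does
data = {row[0]: row[1] for row in reader}
print(data["doggy dog"])
```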
def sliding_window(string, size):
    """Yield every contiguous run of `size` space-separated tokens."""
    tokens = string.split(" ")
    for i in range(len(tokens) - size + 1):
        yield " ".join(tokens[i : i + size])
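As a quick check of the helper above, the 4-word windows over a short phrase look like this (the input string here is just an illustrative example):

```python
def sliding_window(string, size):
    # Same helper as above: yield each contiguous run of `size` tokens.
    tokens = string.split(" ")
    for i in range(len(tokens) - size + 1):
        yield " ".join(tokens[i : i + size])

windows = list(sliding_window("play doubles advocate here now", 4))
print(windows)
# → ['play doubles advocate here', 'doubles advocate here now']
```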
def fix_sentence(words, idiom_map):
    """Run entity resolution between each window and the known eggcorns."""
    all_matches = []
    link = list(idiom_map.keys())
    for word in words:
        er = ER([word, *link], threshold=0.5).run()
        matches = {k: v for k, v in er.items() if v.similarity_score > 0.0}
        for key, val in matches.items():
            if val.cell_value == word:
                # Keep each matched eggcorn only once.
                current_values = [x.matching_value for x in all_matches]
                if val.matching_value not in current_values:
                    all_matches.append(val)
    return all_matches
fixed_string = fix_eggcorns(bad_string)
print(f"Original Sentence:\n{bad_string}\n\n")
print(f"Fixed Sentence:\n{'. '.join(fixed_string)}")
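Running the script end-to-end requires the er_core package, but the replace-and-capitalize step inside fix_eggcorns can be sanity-checked on its own with a stubbed match (the single-entry idiom map below is a hypothetical stand-in for the ER-driven matching):

```python
# Hypothetical single-entry idiom map standing in for the ER-driven matching.
idiom_map = {"doggy dog": "dog-eat-dog"}

sent = "it is a doggy dog world out there."
for eggcorn, correct in idiom_map.items():
    sent = sent.replace(eggcorn, correct)
fixed = sent.capitalize()
print(fixed)
# → It is a dog-eat-dog world out there.
```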