Created
February 26, 2023 07:34
-
-
Save originalankur/a1c2d5b282ca535022916664476f27c6 to your computer and use it in GitHub Desktop.
Tokenise the text and sort the resulting words. Used to check that my son can spell and write all the words.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import contextualSpellCheck | |
import spacy | |
def filter_words(text, stop_words=None):
    """Decide whether a token should be kept in the word list.

    A word is dropped (returns False) when it is a stop word or when,
    after stripping surrounding whitespace, it is 2 characters or fewer.

    Args:
        text: The token text to test (typically already lower-cased by the caller).
        stop_words: Optional container of stop words to test membership against.
            When None, falls back to the module-global ``nlp.Defaults.stop_words``
            (the spaCy pipeline created in the ``__main__`` block).

    Returns:
        True if the word should be kept, False otherwise.
    """
    text = text.strip()
    if stop_words is None:
        # Backward-compatible fallback: the original relied on the global
        # `nlp` built in the __main__ block.
        stop_words = nlp.Defaults.stop_words
    if text in stop_words:
        print("in stop word", text)
        return False
    elif len(text) <= 2:
        print("length is <= 2", text)
        return False
    return True
if __name__ == "__main__": | |
print(sys.argv) | |
if len(sys.argv) != 2: | |
print("Error: Usage python spell_and_write.py chapter_text.txt") | |
sys.exit(0) | |
# read text | |
filename = sys.argv[1] | |
f_handler = open(filename, "r") | |
content = f_handler.read() | |
f_handler.close() | |
# tokenize it | |
nlp = spacy.load("en_core_web_sm") | |
contextualSpellCheck.add_to_pipe(nlp) | |
doc = nlp(content) | |
print(doc._.performed_spellCheck) | |
print(doc._.outcome_spellCheck) | |
# remove stop words | |
words = [token.text.lower().strip() for token in doc if filter_words(token.text.lower())] | |
words = list(set(words)) | |
words.sort() | |
output = open("output_{}".format(filename), "w") | |
output.write("\n".join(words)) | |
output.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment