@originalankur
Created February 26, 2023 07:34
Tokenise the text and sort the unique words. I use it to make sure my son can spell and write all the words in a chapter.
import sys

import contextualSpellCheck
import spacy


def filter_words(text):
    """Keep a token only if it is not a stop word and is longer than two characters."""
    text = text.strip()
    if text in nlp.Defaults.stop_words:
        print("in stop word", text)
        return False
    elif len(text) <= 2:
        print("length is <= 2", text)
        return False
    return True


if __name__ == "__main__":
    print(sys.argv)
    if len(sys.argv) != 2:
        print("Error: Usage python spell_and_write.py chapter_text.txt")
        sys.exit(1)

    # read the chapter text
    filename = sys.argv[1]
    with open(filename, "r") as f_handler:
        content = f_handler.read()

    # tokenize with spaCy and run the contextual spell check pipe
    nlp = spacy.load("en_core_web_sm")
    contextualSpellCheck.add_to_pipe(nlp)
    doc = nlp(content)
    print(doc._.performed_spellCheck)
    print(doc._.outcome_spellCheck)

    # drop stop words and very short tokens, then deduplicate and sort
    words = [token.text.lower().strip() for token in doc if filter_words(token.text.lower())]
    words = sorted(set(words))

    # write one word per line
    with open("output_{}".format(filename), "w") as output:
        output.write("\n".join(words))
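
For reference, here is a minimal sketch of what the contextualSpellCheck extensions used above return, run on a short hard-coded string instead of a file. It assumes spacy and contextualSpellCheck are installed and the en_core_web_sm model has been downloaded; the sample sentence is just an illustration.

import contextualSpellCheck
import spacy

nlp = spacy.load("en_core_web_sm")
contextualSpellCheck.add_to_pipe(nlp)

# A toy sentence with a deliberate misspelling ("quik").
doc = nlp("The quik brown fox jumps over the lazy dog.")

print(doc._.performed_spellCheck)   # True if the spell check pass ran on this doc
print(doc._.outcome_spellCheck)     # the corrected text suggested by the pipe

# Same filtering idea as the script, using spaCy's built-in stop word flag.
print(sorted({t.text.lower() for t in doc if not t.is_stop and len(t.text) > 2}))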