Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Count core words in a document
# spaCy is a natural language processing library that handles tokenization,
# lemmatization, and part-of-speech tagging for us, so we only need to write
# minimal code of our own.
import spacy

if __name__ == '__main__':
    # Load the poem from disk. The text file contains just the content of the
    # poem as is, pasted in with Notepad. A context manager guarantees the
    # file is closed even if read() raises, and an explicit encoding keeps
    # the decoding independent of the platform default.
    with open("C:\\Users\\aanand\\Downloads\\How To Tame a Wild Tongue.txt",
              "r", encoding="utf-8") as file:
        poem = file.read()

    nlp = spacy.load("en_core_web_sm")
    # Run the full NLP pipeline (tokenize, tag, lemmatize) over the poem.
    doc = nlp(poem)

    # Maps each lemma to the number of times it appears in the poem.
    lemmas: dict[str, int] = {}
    for token in doc:
        # token.is_stop filters stop words like "and" and "the";
        # token.text.strip() filters tokens that are only spaces/newlines;
        # the POS check drops punctuation ("PUNCT") and misc symbols ("X").
        # BUG FIX: the original compared token.pos (an integer id) to the
        # string "X", which is always unequal, so symbol tokens were never
        # filtered; the string POS tag lives in token.pos_.
        if (token.text.strip()
                and not token.is_stop
                and token.pos_ not in ("PUNCT", "X")):
            # The lemma is the root of the word, e.g. running, runs, ran
            # all have a lemma of "run".
            lemma: str = token.lemma_.lower()
            lemmas[lemma] = lemmas.get(lemma, 0) + 1

    # Sort the lemmas by count in descending order and print the top 40.
    sorted_lemmas = sorted(lemmas.items(), key=lambda item: item[1],
                           reverse=True)
    for lemma, n in sorted_lemmas[:40]:
        print(f'{lemma} x {n}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment