Created
November 12, 2020 13:11
-
-
Save akainth015/9e94f76127bb4a9a0849855119a4e97d to your computer and use it in GitHub Desktop.
Count core words in a document
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Spacy is a natural language processing tool to aid with quickly building apps. We can use it, meaning we will have
# to do minimal work on our own
from collections import Counter

import spacy

if __name__ == '__main__':
    # Load the poem from my computer. The text file contains just the content of
    # the poem as is, pasted in with Notepad. A context manager guarantees the
    # file handle is closed even if reading raises.
    with open("C:\\Users\\aanand\\Downloads\\How To Tame a Wild Tongue.txt", "r") as file:
        poem = file.read()

    nlp = spacy.load("en_core_web_sm")
    # trigger analysis of the poem
    doc = nlp(poem)

    # Map each lemma (the root of a word, e.g. running, runs, ran all have the
    # lemma "run") to the number of times it appears in the poem.
    lemmas = Counter()
    for token in doc:
        # token.is_stop removes words like "and" and "the" from the document.
        # token.text.strip() removes tokens that are only spaces and newlines.
        # token.pos_ (part-of-speech label) removes punctuation ("PUNCT") and
        # miscellaneous symbols ("X").
        # BUG FIX: the original compared token.pos — spaCy's *integer* POS id —
        # against the string "X", which is always unequal, so "X" tokens were
        # never filtered. token.pos_ is the string label the comparison intends.
        if token.text.strip() and not token.is_stop and token.pos_ != "PUNCT" and token.pos_ != "X":
            lemmas[token.lemma_.lower()] += 1

    # Print out the top 40 lemmas in the poem, most frequent first.
    # Counter.most_common replaces the manual sorted(..., reverse=True) pass.
    for lemma, n in lemmas.most_common(40):
        print(f'{lemma} x {n}')
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.