Created
November 12, 2020 13:11
-
-
Save akainth015/9e94f76127bb4a9a0849855119a4e97d to your computer and use it in GitHub Desktop.
Count core words in a document
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Spacy is a natural language processing tool to aid with quickly building apps. We can use it, meaning we will have
# to do minimal work on our own
from collections import Counter

import spacy

if __name__ == '__main__':
    # Load the poem from my computer. The text file contains just the content of
    # the poem as is, pasted in with Notepad. A context manager guarantees the
    # file handle is closed even if reading raises.
    with open("C:\\Users\\aanand\\Downloads\\How To Tame a Wild Tongue.txt", "r") as file:
        poem = file.read()

    nlp = spacy.load("en_core_web_sm")
    # trigger analysis of the poem
    doc = nlp(poem)

    # Map each lemma (the root of a word, e.g. running, runs, ran all have the
    # lemma "run") to the number of times it appears in the poem.
    lemmas = Counter()
    for token in doc:
        # token.is_stop removes words like "and" and "the" from the document.
        # token.text.strip() removes tokens that are only spaces and newlines.
        # token.pos_ (part-of-speech label) removes punctuation ("PUNCT") and
        # miscellaneous symbols ("X").
        # BUG FIX: the original compared token.pos — spaCy's *integer* POS id —
        # against the string "X", which is always unequal, so "X" tokens were
        # never filtered. token.pos_ is the string label the comparison intends.
        if token.text.strip() and not token.is_stop and token.pos_ != "PUNCT" and token.pos_ != "X":
            lemmas[token.lemma_.lower()] += 1

    # Print out the top 40 lemmas in the poem, most frequent first.
    # Counter.most_common replaces the manual sorted(..., reverse=True) pass.
    for lemma, n in lemmas.most_common(40):
        print(f'{lemma} x {n}')
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.