Skip to content

Instantly share code, notes, and snippets.

@Carl4
Created February 3, 2023 00:31
Show Gist options
  • Save Carl4/5d7ad5d94f577a06868b77232184e0b3 to your computer and use it in GitHub Desktop.
Save Carl4/5d7ad5d94f577a06868b77232184e0b3 to your computer and use it in GitHub Desktop.
import docx
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from typing import Dict
class WordParser:
    """
    Parse the text of a Microsoft Word document and count its keywords.

    A keyword is the Porter stem of a lower-cased token that is not an
    English stop word and that contains at least one letter or digit.
    """

    def __init__(self, file_path: str) -> None:
        """
        Initialize a WordParser instance with the path to the Microsoft Word document.

        The document itself is not opened here; that happens in
        :meth:`parse_keywords`.

        :param file_path: The path to the Microsoft Word document
        """
        self.file_path = file_path
        # Maps stemmed keyword -> number of occurrences seen by the last parse.
        self.keywords: Dict[str, int] = {}
        # NOTE: the NLTK "stopwords" corpus has to be available locally for
        # this lookup to succeed.
        self.stop_words = set(nltk.corpus.stopwords.words("english"))

    def _tally(self, tokens, stem) -> None:
        """
        Add the stems of the meaningful tokens to ``self.keywords``.

        :param tokens: An iterable of raw token strings
        :param stem: A callable mapping a lower-cased token to its stem
        """
        for token in tokens:
            lowered = token.lower()
            if lowered in self.stop_words:
                continue
            # The tokenizer emits punctuation (",", ".", quote marks, ...) as
            # standalone tokens; those are not keywords, so skip anything
            # without a single alphanumeric character.
            if not any(ch.isalnum() for ch in lowered):
                continue
            key = stem(lowered)
            self.keywords[key] = self.keywords.get(key, 0) + 1

    def parse_keywords(self) -> None:
        """
        Parse the text in the Microsoft Word document and track the keywords.

        Counts are rebuilt from scratch on every call, so parsing the same
        document twice does not double the frequencies. Only the entries of
        ``document.paragraphs`` are scanned.
        """
        # Clear in place (rather than rebinding) so a dictionary previously
        # handed out by get_keywords() keeps reflecting the current counts.
        self.keywords.clear()
        document = docx.Document(self.file_path)
        stemmer = PorterStemmer()
        for para in document.paragraphs:
            self._tally(word_tokenize(para.text), stemmer.stem)

    def get_keywords(self) -> Dict[str, int]:
        """
        Return a dictionary of the keywords and their frequency.

        The returned object is the parser's own dictionary, not a copy.

        :return: A dictionary mapping each stemmed keyword to its frequency
        """
        return self.keywords
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment