Skip to content

Instantly share code, notes, and snippets.

@devdave
Created December 1, 2022 18:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save devdave/3868ca58d639f7705ae37b391712a057 to your computer and use it in GitHub Desktop.
Save devdave/3868ca58d639f7705ae37b391712a057 to your computer and use it in GitHub Desktop.
Simple script to summarize the contents of a directory of .docx files.
from pathlib import Path
from collections import Counter
from docx import Document # pip install python-docx
from nltk.tokenize import sent_tokenize, word_tokenize # pip install nltk==3.5
# Directory whose .docx files are counted when this script is run directly
# (it is handed to DocCounter in main()).
# NOTE(review): because this is a raw string, every doubled backslash is kept
# literally in the path; presumably single separators were intended -- confirm
# that the target platform resolves this path as expected.
ROOT = Path(r"C:\\REDACTED\\book\\")
class DocCounter:
    """Tally sentences, tokens, and unique tokens across the .docx files in a directory.

    Attributes:
        root: Directory that ``run`` scans (non-recursively).
        counts: Running totals keyed by ``"sentences"``, ``"tokens"`` and
            ``"unique_words"``. ``"unique_words"`` is the number of distinct
            tokens seen across every document counted so far.
    """

    def __init__(self, root_path: Path):
        """Create a counter for the documents directly inside *root_path*."""
        self.root = root_path
        # Seed the totals with explicit zeros. Passing a list of labels to
        # Counter would count the labels themselves and start each total at 1.
        self.counts = Counter({"sentences": 0, "unique_words": 0, "tokens": 0})
        # Distinct tokens seen so far, so the corpus-wide unique count is a
        # true union rather than the figure for whichever file came last.
        self._vocabulary = set()

    @staticmethod
    def get_body(path: Path) -> str:
        """Return the text of the document at *path* as a single string.

        Paragraphs that are empty or whitespace-only are dropped; the rest
        are joined with single spaces.
        """
        doc = Document(path)
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        # Join with spaces rather than newlines: newline-joined text was
        # observed to break sentence tokenizing.
        return " ".join(paragraphs)

    def count_body(self, body: str):
        """Tokenize *body*, fold its counts into the running totals, and report them.

        Tokens are compared exactly as the tokenizer returns them; no case
        folding or other normalisation is applied before de-duplicating.

        Returns:
            A ``(sentences, tokens, unique_tokens)`` tuple of ints describing
            *body* alone (not the running totals).
        """
        sentences = sent_tokenize(body)
        tokens = word_tokenize(body)
        unique = set(tokens)

        self.counts["sentences"] += len(sentences)
        self.counts["tokens"] += len(tokens)
        # Merge into the corpus vocabulary, then publish its size as the total.
        self._vocabulary |= unique
        self.counts["unique_words"] = len(self._vocabulary)

        return len(sentences), len(tokens), len(unique)

    def run(self) -> Counter:
        """Count every eligible .docx file directly under ``self.root``.

        A per-file summary is printed for each document processed.

        Returns:
            The running totals (the same object as ``self.counts``).
        """
        # Sorted so the report order is stable from run to run.
        for element in sorted(self.root.iterdir()):
            if not element.is_file():
                continue
            if not element.name.endswith(".docx"):
                continue
            # Names starting with "~" are skipped -- presumably the temporary
            # files an editor leaves next to an open document.
            if element.name.startswith("~"):
                continue

            # Announce the file before parsing so a failure can be attributed.
            print(f"Processing: {element.name}")
            body = self.get_body(element)
            sent_ct, token_ct, unique_ct = self.count_body(body)
            print(f"\tSentences: {sent_ct:,}, Tokens: {token_ct:,} Unique: {unique_ct:,}")
            print()

        return self.counts
def main():
    """Count the documents under ROOT and print the overall totals."""
    counter = DocCounter(ROOT)
    counts = counter.run()

    # Blank line to set the summary apart from the per-file report.
    print()
    summary = f"Totals -> Sentences: {counts['sentences']:,} Tokens: {counts['tokens']:,} Unique words: {counts['unique_words']:,} "
    print(summary)


# Only run the report when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment