Skip to content

Instantly share code, notes, and snippets.

@kuipumu
Created June 23, 2021 22:21
Show Gist options
  • Save kuipumu/4a05ca912d6452de627d73fe881e95e5 to your computer and use it in GitHub Desktop.
Save kuipumu/4a05ca912d6452de627d73fe881e95e5 to your computer and use it in GitHub Desktop.
Get the word count of a document (plain text only), excluding words found in a dictionary. A document in csv format is created with each word and its corresponding count.
#!/usr/bin/env python
"""wordcount.py
Get the word count of a document (plain text only), excluding words found in a dictionary.
A document in csv format is created with each word and its corresponding count.
Example:
Select a document to count words from using the -i or --input parameter;
the document must be in a plain text format. Add a dictionary of words to exclude from the count
using the -d or --dictionary parameter. A file with csv format
is going to be created with each word and its corresponding count. You can also change
the name of the output file with the -o or --output parameter.
$ python wordcount.py -i alice_in_wonderland.txt -d 1-1000.txt
Attributes:
document (str): Document to count words from.
dictionary (str): Dictionary to exclude word count.
occurrences: (dict) Counter containing word occurrences.
outputname: (str) Filename of file to export.
file: (file-like) Output file with word count results.
Returns:
A csv file in the current folder with word count results from the specified document.
"""
from argparse import ArgumentParser
from collections import Counter
from csv import writer
from pathlib import Path
from re import findall
# A word is a run of ASCII letters/digits, optionally joined by apostrophes or
# hyphens ("don't", "well-known"), with an optional leading/trailing apostrophe.
_WORD_PATTERN = r"[’']?\b[a-zA-Z0-9]+(?:[’'-][a-zA-Z0-9]+)*\b[’']?"


def _build_parser():
    """Return the command-line argument parser used by the script."""
    parser = ArgumentParser(
        description='Get word count in document and filter from words in a dictionary.'
    )
    # Input document; required together with --dictionary (checked in main).
    parser.add_argument("--input", "-i", help="set document to count words from.")
    # Output filename without extension (optional); ".csv" is appended.
    parser.add_argument("--output", "-o", help="set output filename.")
    # Dictionary of words to exclude; required together with --input.
    parser.add_argument("--dictionary", "-d", help="set dictionary to filter word count.")
    return parser


def _count_words(text, excluded):
    """Count the words of *text* that are not listed in *excluded*.

    Args:
        text: Plain text to extract words from.
        excluded: Iterable of words that must not be counted.

    Returns:
        A list of ``(count, word)`` tuples ordered from most to least common.
        Words are case-folded, and ties keep first-occurrence order.
    """
    # Case-fold the exclusions as well: the text is case-folded before matching,
    # so an entry such as "I" or "The" would otherwise never be filtered out.
    # A set also makes each membership test O(1) instead of scanning a list.
    excluded_words = {word.casefold() for word in excluded}
    words = findall(_WORD_PATTERN, text.casefold())
    occurrences = Counter(word for word in words if word not in excluded_words)
    # Swap each (word, count) pair to match the "Count, Word" column order.
    return [(count, word) for word, count in occurrences.most_common()]


def main(argv=None):
    """Run the word count and write the results to a csv file.

    Args:
        argv: Command-line arguments without the program name. When ``None``
            the arguments are read from ``sys.argv``.

    Returns:
        The ``Path`` of the csv file that was written, or ``None`` when the
        arguments were incomplete or a given file could not be found (a message
        is printed in that case).
    """
    args = _build_parser().parse_args(argv)

    # Both the document and the dictionary are needed to produce a result.
    if not (args.input and args.dictionary):
        print("No document or dictionary set to count words from.")
        return None

    # is_file() also rejects directories, which exists() would let through.
    document = Path(args.input)
    if not document.is_file():
        print("No valid document found.")
        return None

    dictionary_path = Path(args.dictionary)
    if not dictionary_path.is_file():
        print("No valid dictionary found.")
        return None

    # Get words from dictionary in a list (whitespace separated).
    dictionary = dictionary_path.read_text().split()
    occurrences = _count_words(document.read_text(), dictionary)

    # Use the --output name when given, otherwise derive it from the document.
    if args.output:
        outputname = args.output + ".csv"
    else:
        outputname = document.stem + "_wcount.csv"

    # newline="" is required by the csv module; without it an extra blank row
    # is written after every record on platforms that translate "\n".
    with open(outputname, "w", newline="") as output:
        file = writer(output)
        file.writerow(["Count", "Word"])
        file.writerows(occurrences)

    # Report where the file really is: relative names are resolved against the
    # current working directory, not the directory containing this script.
    print("Results saved to " + str(Path(outputname).resolve()))
    return Path(outputname)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment