Created
June 23, 2021 22:21
-
-
Save kuipumu/4a05ca912d6452de627d73fe881e95e5 to your computer and use it in GitHub Desktop.
Get word count in document (plain text only) excluding words in a dictionary. A document in csv format is going to be created with each word and its corresponding count.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""wordcount.py

Get word count in document (plain text only) excluding words in a dictionary.
A document in csv format is going to be created with each word and its corresponding count.

Example:
    Select a document to count words from using the -i or --input parameter;
    the document must be in a plain text format. Add a dictionary of words to exclude
    from the count using the -d or --dictionary parameter. A file in csv format
    is going to be created with each word and its corresponding count; you can also change
    the name of the output file with the -o or --output parameter.

        $ python wordcount.py -i alice_in_wonderland.txt -d 1-1000.txt

Attributes:
    document (Path): Document to count words from.
    dictionary (list): Dictionary words to exclude from the word count.
    occurrences (Counter): Counter containing word occurrences.
    outputname (str): Filename of file to export.
    file (csv writer): Writer for the output file with word count results.

Returns:
    A csv file in the current folder with word count results from the specified document.
"""
from argparse import ArgumentParser | |
from collections import Counter | |
from csv import writer | |
from pathlib import Path | |
from re import findall | |
# A word is a run of ASCII letters/digits, optionally joined by inner
# apostrophes or hyphens ("don't", "well-known"), with an optional leading or
# trailing apostrophe kept attached to it.
WORD_PATTERN = r"[’']?\b[a-zA-Z0-9]+(?:[’'-][a-zA-Z0-9]+)*\b[’']?"


def count_words(text, excluded=()):
    """Count the words in *text*, skipping every word listed in *excluded*.

    Both the text and the excluded words are casefolded before comparison, so
    the exclusion is case-insensitive.

    Args:
        text: Plain text to extract words from.
        excluded: Iterable of words that must not appear in the result.

    Returns:
        A list of ``(count, word)`` tuples ordered from most to least common;
        words with equal counts keep their order of first appearance.
    """
    # A set gives O(1) membership tests; casefolding matches the casefolded
    # document words (otherwise an entry such as "Alice" could never match).
    excluded_words = {word.casefold() for word in excluded}
    occurrences = Counter(findall(WORD_PATTERN, text.casefold()))
    # Counts come first in each tuple to match the csv column order.
    return [
        (count, word)
        for word, count in occurrences.most_common()
        if word not in excluded_words
    ]


def main(argv=None):
    """Run the command line interface.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]`` when None.
    """
    # Initiate the parser with a description.
    parser = ArgumentParser(
        description='Get word count in document and filter from words in a dictionary.'
    )
    # Add input argument.
    parser.add_argument("--input", "-i", help="set document to count words from.")
    # Add output argument (optional).
    parser.add_argument("--output", "-o", help="set output filename.")
    # Add dictionary argument (checked below: it must be provided).
    parser.add_argument("--dictionary", "-d", help="set dictionary to filter word count.")
    # Read arguments from the command line.
    args = parser.parse_args(argv)

    # Both the document and the dictionary must be given.
    if not (args.input and args.dictionary):
        print("No document or dictionary set to count words from.")
        return

    document = Path(args.input)
    if not document.exists():
        print("No valid document found.")
        return

    # Report a missing dictionary the same way as a missing document instead
    # of failing with a traceback.
    dictionary = Path(args.dictionary)
    if not dictionary.exists():
        print("No valid dictionary found.")
        return

    occurrences = count_words(document.read_text(), dictionary.read_text().split())

    # Use the --output name when given, otherwise derive it from the document.
    if args.output:
        outputname = args.output + ".csv"
    else:
        outputname = document.stem + "_wcount.csv"

    # newline="" lets the csv module control line endings (avoids blank rows
    # between records on Windows).
    with open(outputname, "w", newline="") as output:
        file = writer(output)
        file.writerow(["Count", "Word"])
        file.writerows(occurrences)

    # The file is written relative to the current working directory, so
    # resolve the output name itself rather than the script's location.
    print("Results saved to " + str(Path(outputname).resolve()))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment