kuipumu/wordcount.py

## wordcount.py
#!/usr/bin/env python
"""wordcount.py

Count the words in a plain-text document, excluding the words listed in a
dictionary file, and write each remaining word with its count to a CSV file.

Example:
    Select the document to count words from with the -i or --input parameter
    (it must be plain text) and the dictionary of words to exclude with the
    -d or --dictionary parameter. A CSV file is created with each remaining
    word and its corresponding count; the name of the output file can be
    changed with the -o or --output parameter (".csv" is always appended).

        $ python wordcount.py -i alice_in_wonderland.txt -d 1-1000.txt

Functions:
    build_parser: Create the command line argument parser.
    count_words: Count the words of a text that are not excluded.
    main: Command line entry point.

Returns:
    A CSV file, by default in the current working directory, with the word
    count results from the specified document.
"""

from argparse import ArgumentParser
from collections import Counter
from csv import writer
from pathlib import Path
from re import findall

# A word is a run of ASCII letters/digits, optionally joined by apostrophes
# or hyphens ("alice's", "hat-box"), with an optional leading and trailing
# apostrophe kept as part of the token.
WORD_PATTERN = r"[’']?\b[a-zA-Z0-9]+(?:[’'-][a-zA-Z0-9]+)*\b[’']?"


def build_parser():
    """Create the command line argument parser.

    Returns:
        ArgumentParser: Parser accepting --input/-i, --output/-o and
        --dictionary/-d. None of them is enforced by argparse; ``main``
        reports a missing input or dictionary itself.
    """
    parser = ArgumentParser(
        description='Get word count in document and filter from words in a dictionary.'
    )

    # Document to read (checked in main, not by argparse).
    parser.add_argument("--input", "-i", help="set document to count words from.")

    # Output filename without extension (optional).
    parser.add_argument("--output", "-o", help="set output filename.")

    # Dictionary of words to exclude (checked in main, not by argparse).
    parser.add_argument("--dictionary", "-d", help="set dictionary to filter word count.")

    return parser


def count_words(text, excluded=()):
    """Count the words of a text that are not excluded.

    Args:
        text (str): Text to count words from.
        excluded (iterable of str): Words to leave out of the result.

    Returns:
        list: ``(count, word)`` tuples ordered from most to least common;
        words with equal counts keep their order of first appearance.
    """
    # The text is casefolded before matching, so the excluded words must be
    # casefolded too or entries such as "The" or "I" would never match.
    # A set keeps each membership test O(1).
    skip = {word.casefold() for word in excluded}

    counts = Counter(findall(WORD_PATTERN, text.casefold()))

    # Swap each pair so the columns come out as (count, word).
    return [
        (count, word)
        for word, count in counts.most_common()
        if word not in skip
    ]


def main(argv=None):
    """Run the word count from the command line.

    Args:
        argv (list of str, optional): Arguments to parse; when None,
            argparse falls back to ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    # Both a document and a dictionary are needed to do anything useful.
    if not (args.input and args.dictionary):
        print("No document or dictionary set to count words from.")
        return

    # is_file() also rejects directories, which exists() would let through
    # only for read_text() to fail on them.
    document = Path(args.input)
    if not document.is_file():
        print("No valid document found.")
        return

    dictionary = Path(args.dictionary)
    if not dictionary.is_file():
        print("No valid dictionary found.")
        return

    rows = count_words(document.read_text(), dictionary.read_text().split())

    # Use the --output name when given, otherwise derive it from the document.
    if args.output:
        outputname = args.output + ".csv"
    else:
        outputname = document.stem + "_wcount.csv"

    # newline="" lets the csv writer control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open(outputname, "w", newline="") as output:
        csv_writer = writer(output)
        csv_writer.writerow(["Count", "Word"])
        csv_writer.writerows(rows)

    # Report where the file really is: relative names resolve against the
    # working directory, not against the folder holding this script.
    print("Results saved to " + str(Path(outputname).resolve()))


if __name__ == "__main__":
    main()
	#!/usr/bin/env python
"""wordcount.py

Count the words in a plain-text document, excluding the words listed in a
dictionary file, and write each remaining word with its count to a CSV file.

Example:
    Select the document to count words from with the -i or --input parameter
    (it must be plain text) and the dictionary of words to exclude with the
    -d or --dictionary parameter. A CSV file is created with each remaining
    word and its corresponding count; the name of the output file can be
    changed with the -o or --output parameter (".csv" is always appended).

        $ python wordcount.py -i alice_in_wonderland.txt -d 1-1000.txt

Functions:
    build_parser: Create the command line argument parser.
    count_words: Count the words of a text that are not excluded.
    main: Command line entry point.

Returns:
    A CSV file, by default in the current working directory, with the word
    count results from the specified document.
"""

from argparse import ArgumentParser
from collections import Counter
from csv import writer
from pathlib import Path
from re import findall

# A word is a run of ASCII letters/digits, optionally joined by apostrophes
# or hyphens ("alice's", "hat-box"), with an optional leading and trailing
# apostrophe kept as part of the token.
WORD_PATTERN = r"[’']?\b[a-zA-Z0-9]+(?:[’'-][a-zA-Z0-9]+)*\b[’']?"


def build_parser():
    """Create the command line argument parser.

    Returns:
        ArgumentParser: Parser accepting --input/-i, --output/-o and
        --dictionary/-d. None of them is enforced by argparse; ``main``
        reports a missing input or dictionary itself.
    """
    parser = ArgumentParser(
        description='Get word count in document and filter from words in a dictionary.'
    )

    # Document to read (checked in main, not by argparse).
    parser.add_argument("--input", "-i", help="set document to count words from.")

    # Output filename without extension (optional).
    parser.add_argument("--output", "-o", help="set output filename.")

    # Dictionary of words to exclude (checked in main, not by argparse).
    parser.add_argument("--dictionary", "-d", help="set dictionary to filter word count.")

    return parser


def count_words(text, excluded=()):
    """Count the words of a text that are not excluded.

    Args:
        text (str): Text to count words from.
        excluded (iterable of str): Words to leave out of the result.

    Returns:
        list: ``(count, word)`` tuples ordered from most to least common;
        words with equal counts keep their order of first appearance.
    """
    # The text is casefolded before matching, so the excluded words must be
    # casefolded too or entries such as "The" or "I" would never match.
    # A set keeps each membership test O(1).
    skip = {word.casefold() for word in excluded}

    counts = Counter(findall(WORD_PATTERN, text.casefold()))

    # Swap each pair so the columns come out as (count, word).
    return [
        (count, word)
        for word, count in counts.most_common()
        if word not in skip
    ]


def main(argv=None):
    """Run the word count from the command line.

    Args:
        argv (list of str, optional): Arguments to parse; when None,
            argparse falls back to ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    # Both a document and a dictionary are needed to do anything useful.
    if not (args.input and args.dictionary):
        print("No document or dictionary set to count words from.")
        return

    # is_file() also rejects directories, which exists() would let through
    # only for read_text() to fail on them.
    document = Path(args.input)
    if not document.is_file():
        print("No valid document found.")
        return

    dictionary = Path(args.dictionary)
    if not dictionary.is_file():
        print("No valid dictionary found.")
        return

    rows = count_words(document.read_text(), dictionary.read_text().split())

    # Use the --output name when given, otherwise derive it from the document.
    if args.output:
        outputname = args.output + ".csv"
    else:
        outputname = document.stem + "_wcount.csv"

    # newline="" lets the csv writer control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open(outputname, "w", newline="") as output:
        csv_writer = writer(output)
        csv_writer.writerow(["Count", "Word"])
        csv_writer.writerows(rows)

    # Report where the file really is: relative names resolve against the
    # working directory, not against the folder holding this script.
    print("Results saved to " + str(Path(outputname).resolve()))


if __name__ == "__main__":
    main()