layoaster/shakespeare_stats.py

## shakespeare_stats.py
#!/usr/bin/env python

# Extract word frequencies out of three shakespeare pieces.
# The following is assumed:
# * No word stemming.
# * Hyphenated compound words are indexed as a single word.
# * Script's characters or titles are not indexed.
# * Script comments within square brackets are indexed.
#
# Usage:
#    shakespeare_stats.py [output_filename]
#
# Output format:
#    A JSON schema with a breakdown of statistics per processed file and
#    the aggregated stats under the `aggregate` attribute. The word indexes
#    are alphabetically sorted.
#
# Requires Python 3.6.x

import json
import sys
from collections import Counter, defaultdict

# Input file paths/names
FILE_PATHS = ['kinglear.txt', 'othello.txt', 'romeoandjuliet.txt']
# Default output filename
DEFAULT_FILENAME = 'stats.txt'
# Symbols to take off of words
PUNCTUATION = '\'",.;:{}()&?!|[]'
# All-caps tokens are treated as character names/titles and skipped, but
# these one-letter tokens are ordinary English words ("A", "I", "O") and
# must still be counted.
_SINGLE_LETTER_WORDS = frozenset('AIO')


def count_words(text: str) -> Counter:
    """Return the frequency of every lower-cased word found in *text*.

    Tokens are separated by any whitespace or by a double hyphen (`--`).
    Leading/trailing PUNCTUATION characters are stripped from each token;
    hyphens inside a word are kept, so compound words stay whole.

    Tokens written entirely in upper case are assumed to be character
    names or titles and are skipped, except for the one-letter words
    "A", "I" and "O".

    NOTE: a bare Roman numeral "I" (e.g. in "ACT I") is indistinguishable
    from the pronoun and is therefore counted as the word "i".
    """
    words_index = Counter()

    for token in text.replace('--', ' ').split():
        word = token.strip(PUNCTUATION)

        if not word:
            # Token was made only of punctuation
            continue

        if word.isupper() and word not in _SINGLE_LETTER_WORDS:
            # Script's character name or title
            continue

        words_index[word.lower()] += 1

    return words_index


def main(argv=None) -> None:
    """Index every file in FILE_PATHS and dump the statistics as JSON.

    argv: command line arguments without the program name; defaults to
        `sys.argv[1:]`. The first one, if present, is the output filename,
        otherwise DEFAULT_FILENAME is used.

    The output maps each input filename to its word counts and the
    `aggregate` key to the counts summed over all files.

    Raises OSError if a file cannot be opened and UnicodeDecodeError if an
    input file is not valid UTF-8.
    """
    args = sys.argv[1:] if argv is None else argv
    output_filename = args[0] if args else DEFAULT_FILENAME

    stats = {'aggregate': Counter()}

    for filename in FILE_PATHS:
        # Explicit encoding: do not depend on the platform's locale
        with open(filename, 'r', encoding='utf-8') as input_file:
            words_index = count_words(input_file.read())

        # Aggregating statistics
        stats[filename] = words_index
        stats['aggregate'].update(words_index)

    # Writing stats to output file. `ensure_ascii=False` may emit non-ASCII
    # characters, so the file encoding has to be able to represent them.
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(
            stats, output_file, ensure_ascii=False, sort_keys=True, indent=3
        )


if __name__ == '__main__':
    main()
	#!/usr/bin/env python

	# Extract word frequencies out of three shakespeare pieces.
	# The following is assumed:
	# * No word stemming.
	# * Hyphenated compound words are indexed as a single word.
	# * Script's characters or titles are not indexed.
	# * Script comments within square brackets are indexed.
	#
	# Usage:
	# shakespeare_stats.py [output_filename]
	#
	# Output format:
	# A JSON schema with a breakdown of statistics per processed file and
	# the aggregated stats under the `aggregate` attribute. The word indexes
	# are alphabetically sorted.
	#
	# Requires Python 3.6.x

	import json
	import sys
	from collections import Counter, defaultdict

	# Input file paths/names
	FILE_PATHS = ['kinglear.txt', 'othello.txt', 'romeoandjuliet.txt']
	# Default output filename
	DEFAULT_FILENAME = 'stats.txt'
	# Symbols to take off of words.
	# NOTE(review): the backslash is spelled `\\` because the previous
	# literal `\|` was an invalid escape sequence that Python evaluated as
	# backslash + pipe; confirm whether stripping backslashes is wanted.
	PUNCTUATION = '\'",.;:{}()&?!\\|[]'
	# All-caps tokens are treated as character names/titles and skipped, but
	# these one-letter tokens are ordinary English words ("A", "I", "O") and
	# must still be counted.
	_SINGLE_LETTER_WORDS = frozenset('AIO')


	def count_words(text: str) -> Counter:
	    """Return the frequency of every lower-cased word found in *text*.

	    Tokens are separated by any whitespace or by a double hyphen (`--`).
	    Leading/trailing PUNCTUATION characters are stripped from each token;
	    hyphens inside a word are kept, so compound words stay whole.

	    Tokens written entirely in upper case are assumed to be character
	    names or titles and are skipped, except for the one-letter words
	    "A", "I" and "O".

	    NOTE: a bare Roman numeral "I" (e.g. in "ACT I") is indistinguishable
	    from the pronoun and is therefore counted as the word "i".
	    """
	    words_index = Counter()

	    for token in text.replace('--', ' ').split():
	        word = token.strip(PUNCTUATION)

	        if not word:
	            # Token was made only of punctuation
	            continue

	        if word.isupper() and word not in _SINGLE_LETTER_WORDS:
	            # Script's character name or title
	            continue

	        words_index[word.lower()] += 1

	    return words_index


	def main(argv=None) -> None:
	    """Index every file in FILE_PATHS and dump the statistics as JSON.

	    argv: command line arguments without the program name; defaults to
	        `sys.argv[1:]`. The first one, if present, is the output
	        filename, otherwise DEFAULT_FILENAME is used.

	    The output maps each input filename to its word counts and the
	    `aggregate` key to the counts summed over all files.

	    Raises OSError if a file cannot be opened and UnicodeDecodeError if
	    an input file is not valid UTF-8.
	    """
	    args = sys.argv[1:] if argv is None else argv
	    output_filename = args[0] if args else DEFAULT_FILENAME

	    stats = {'aggregate': Counter()}

	    for filename in FILE_PATHS:
	        # Explicit encoding: do not depend on the platform's locale
	        with open(filename, 'r', encoding='utf-8') as input_file:
	            words_index = count_words(input_file.read())

	        # Aggregating statistics
	        stats[filename] = words_index
	        stats['aggregate'].update(words_index)

	    # Writing stats to output file. `ensure_ascii=False` may emit
	    # non-ASCII characters, so the file encoding has to be able to
	    # represent them.
	    with open(output_filename, 'w', encoding='utf-8') as output_file:
	        json.dump(
	            stats, output_file, ensure_ascii=False, sort_keys=True, indent=3
	        )


	if __name__ == '__main__':
	    main()