Skip to content

Instantly share code, notes, and snippets.

@layoaster
Created October 23, 2018 13:38
Show Gist options
  • Save layoaster/ff24a98f4317b55f85cc42c67984b95a to your computer and use it in GitHub Desktop.
Save layoaster/ff24a98f4317b55f85cc42c67984b95a to your computer and use it in GitHub Desktop.
Shakespeare's word frequencies
#!/usr/bin/env python
# Extract word frequencies out of three Shakespeare plays.
# The following is assumed:
# * No word stemming.
# * Hyphenated compound words are indexed as a single word.
# * Script's characters or titles are not indexed.
# * Script comments within square brackets are indexed.
#
# Usage:
# shakespeare_stats.py [output_filename]
#
# Output format:
# A JSON document with a breakdown of word counts per processed file and
# the combined counts under the `aggregate` key. The keys of every word
# index are sorted alphabetically.
#
# Requires Python 3.6.x
import json
import sys
from collections import Counter, defaultdict
# Input file paths/names (resolved relative to the current directory)
FILE_PATHS = ['kinglear.txt', 'othello.txt', 'romeoandjuliet.txt']
# Default output filename
DEFAULT_FILENAME = 'stats.txt'
# Symbols to take off of both ends of a word
PUNCTUATION = '\'",.;:{}()&?!|[]'
# Genuine one-letter English words. They are trivially "all uppercase" when
# capitalised, so they must be exempted from the character-name filter;
# otherwise the pronoun "I" would never be counted.
SINGLE_LETTER_WORDS = frozenset(('A', 'I', 'O'))


def count_words(text):
    """Return a Counter mapping each lower-cased word of *text* to its count.

    Words are separated by any whitespace or by a double hyphen (`--`);
    single hyphens are kept so compound words stay in one piece.
    Leading/trailing PUNCTUATION is stripped from every word. Tokens that
    are empty after stripping are ignored, and so are all-uppercase tokens
    (character names, act/scene titles) unless they are one of
    SINGLE_LETTER_WORDS.

    Known trade-off: the numeral in a heading such as "ACT I" is
    indistinguishable from the pronoun and is therefore counted as "i".
    """
    words_index = Counter()
    # `--` is used as a dash between words; str.split() then handles tabs,
    # newlines and runs of blanks in a single pass.
    for word in text.replace('--', ' ').split():
        clean_word = word.strip(PUNCTUATION)
        if not clean_word:
            # The token was nothing but punctuation
            continue
        if clean_word.isupper() and clean_word not in SINGLE_LETTER_WORDS:
            # Script's character names/titles
            continue
        words_index[clean_word.lower()] += 1
    return words_index


def main(argv=None):
    """Index every file in FILE_PATHS and write the statistics as JSON.

    argv: command-line arguments including the program name; defaults to
        sys.argv. argv[1], when present, is the output filename, otherwise
        DEFAULT_FILENAME is used.

    Raises OSError if an input file cannot be read or the output file
    cannot be written.
    """
    if argv is None:
        argv = sys.argv
    output_filename = argv[1] if len(argv) > 1 else DEFAULT_FILENAME

    stats = {'aggregate': Counter()}
    for filename in FILE_PATHS:
        # Explicit encoding: do not depend on the platform's locale
        with open(filename, 'r', encoding='utf-8') as input_file:
            words_index = count_words(input_file.read())
        # Aggregating statistics
        stats[filename] = words_index
        stats['aggregate'].update(words_index)

    # Writing stats to output file. ensure_ascii=False emits raw non-ASCII
    # characters, so the file must be opened with an encoding that can
    # represent them.
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(
            stats, output_file, ensure_ascii=False, sort_keys=True, indent=3
        )


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment