Skip to content

Instantly share code, notes, and snippets.

@karanparikh
Created October 1, 2015 07:06
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save karanparikh/6d98d55bdc5bc8d2bff9 to your computer and use it in GitHub Desktop.
Script to analyze blog posts I wrote
from sys import argv
from os import listdir
from re import split
from collections import Counter
class Post(object):
    """A single blog post: its title plus its words, lower-cased.

    Attributes:
        title: The title passed to the constructor, stored unchanged.
        content: List of the post's words, each converted to lower case.
    """

    def __init__(self, title, content):
        """Store ``title`` and a lower-cased list copy of ``content``.

        Args:
            title: Identifier for the post.
            content: Iterable of word strings.
        """
        self.title = title
        # Build a real list rather than using ``map``: on Python 3 ``map``
        # returns a one-shot iterator, which has no ``len()`` and is
        # exhausted after a single pass.
        self.content = [word.lower() for word in content]
def process_posts(post_dir):
    """Print word statistics for the post files found in ``post_dir``.

    Every regular file directly inside ``post_dir`` is read, split into
    words on runs of non-word characters and wrapped in a ``Post``.  The
    report written to stdout lists up to three of the shortest and up to
    three of the longest posts, the average post length, the total number
    of words, the number of distinct words and the 100 most used words.

    Args:
        post_dir: Path of the directory containing the post files.

    Returns:
        None.  All results are printed.
    """
    # Local import: the file itself only imports ``listdir`` from ``os``.
    from os.path import isfile, join

    posts = []
    # Sorted so that posts of equal length are reported in a stable order
    # (``listdir`` returns entries in arbitrary order).
    for post_file in sorted(listdir(post_dir)):
        fullname = join(post_dir, post_file)
        if not isfile(fullname):
            # Sub-directories cannot be opened as posts; skip them.
            continue
        with open(fullname) as pf:
            text = pf.read()
        # ``split`` yields empty strings when the text starts or ends with a
        # separator (e.g. a trailing newline); those are not words.
        words = [w for w in split(r"\W+", text) if w]
        posts.append(Post(post_file, words))

    if not posts:
        # Nothing to rank and nothing to average (avoids dividing by zero).
        print("No posts found in {}".format(post_dir))
        return

    posts.sort(key=lambda p: len(p.content))

    # Slices instead of fixed indices so fewer than three posts still work.
    print("The shortest posts are: ")
    for post in posts[:3]:
        print("{} with {} words".format(post.title, len(post.content)))
    print("The longest posts are: ")
    for post in posts[-3:][::-1]:  # longest first
        print("{} with {} words".format(post.title, len(post.content)))

    word_counter = Counter()
    for post in posts:
        word_counter.update(post.content)
    total_post_length = sum(len(post.content) for post in posts)

    # float() keeps true division on Python 2 while the total itself stays
    # an integer, so it is not printed with a trailing ".0".
    average_length = float(total_post_length) / len(posts)
    print("The average post length is {} words".format(average_length))
    print("The number of words written is {}".format(total_post_length))
    # The Counter's keys are exactly the distinct words.
    print("The number of unique words used is {}".format(len(word_counter)))
    print("The 100 most used words are {}".format(word_counter.most_common(100)))
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the directory
    # that holds the posts.
    if len(argv) < 2:
        # Exit with a usage message rather than an IndexError traceback.
        raise SystemExit("usage: {} POST_DIR".format(argv[0]))
    process_posts(argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment