Script to analyze blog posts I wrote
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter
from os import listdir
from os.path import isfile, join
from re import split
from sys import argv
class Post(object):
    """A blog post: its title plus its words normalised to lower case."""

    def __init__(self, title, content):
        """Store the title and a lower-cased list of the post's words.

        Args:
            title: Name identifying the post.
            content: Iterable of word strings.
        """
        self.title = title
        # Build a real list rather than using map(): on Python 3 map() returns
        # a one-shot iterator, so len(post.content) and repeated iteration
        # over the words would fail.
        self.content = [word.lower() for word in content]
def process_posts(post_dir):
    """Print word statistics for the post files found in ``post_dir``.

    Every regular file directly inside ``post_dir`` is read, split into words
    on runs of non-word characters and wrapped in a ``Post``.  The report is
    written to standard output and lists the (up to) three shortest and three
    longest posts, the average post length, the total and unique word counts
    and the 100 most frequent words.

    Args:
        post_dir: Path of the directory containing the post files.
    """
    posts = []
    # listdir() order is arbitrary; sorting the names makes ties in post
    # length come out in a reproducible order (list.sort() is stable).
    for post_file in sorted(listdir(post_dir)):
        fullname = join(post_dir, post_file)
        if not isfile(fullname):
            # listdir() also returns sub-directories, which open() cannot read.
            continue
        with open(fullname) as pf:
            # split() yields empty strings when the text starts or ends with a
            # non-word character (e.g. a trailing newline); drop them so they
            # are not counted as words.
            words = [w for w in split(r"\W+", pf.read()) if w]
        posts.append(Post(post_file, words))

    if not posts:
        # Nothing to rank or average; avoid IndexError / ZeroDivisionError.
        print("No posts found in {}".format(post_dir))
        return

    posts.sort(key=lambda p: len(p.content))

    # Slices instead of fixed indices so fewer than three posts still works.
    print("The shortest posts are: ")
    for post in posts[:3]:
        print("{} with {} words".format(post.title, len(post.content)))
    print("The longest posts are: ")
    for post in reversed(posts[-3:]):
        print("{} with {} words".format(post.title, len(post.content)))

    word_counter = Counter()
    for post in posts:
        word_counter.update(post.content)
    # The counter already holds one key per distinct word, so no separate set
    # is needed; the total stays an integer so it is not printed as "123.0".
    total_post_length = sum(len(post.content) for post in posts)

    # float() keeps true division on Python 2 as well.
    average_length = float(total_post_length) / len(posts)
    print("The average post length is {} words".format(average_length))
    print("The number of words written is {}".format(total_post_length))
    print("The number of unique words used is {}".format(len(word_counter)))
    print("The 100 most used words are {}".format(word_counter.most_common(100)))
if __name__ == "__main__":
    # The posts directory is a required command-line argument; exit with a
    # usage message instead of an IndexError traceback when it is missing.
    if len(argv) < 2:
        raise SystemExit("Usage: {} POST_DIR".format(argv[0]))
    process_posts(argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment