Skip to content

Instantly share code, notes, and snippets.

@jmoiron
Created May 5, 2011 00:04
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jmoiron/956284 to your computer and use it in GitHub Desktop.
Save jmoiron/956284 to your computer and use it in GitHub Desktop.
silly stats on amit's blog
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""post stats on amit just because"""
import re
import urllib2
from lxml import html
# Address of the journal page that main() downloads and reports on.
url = 'http://omlettesoft.com/newjournal.php3?topic=On+the+Waterfront&who=Lord+Omlette'
class Blog(object):
    """Aggregate statistics over every entry found in a journal page.

    ``dom`` must offer ``cssselect``; each ``div.entry`` element it yields is
    wrapped in a ``Post`` and the per-post counters are summed into the
    ``total_*`` attributes.
    """

    def __init__(self, dom):
        entries = dom.cssselect('div.entry')
        posts = []
        for entry in entries:
            posts.append(Post(entry))

        self.dom = dom
        self.entries = entries
        self.posts = posts

        # Blog-wide totals, one per counter exposed by Post.
        self.total_count = self._tally('total')
        self.total_om_count = self._tally('om_count')
        self.total_quote_count = self._tally('quote_count')
        self.total_jerm_count = self._tally('jerm_count')
        self.total_jing_count = self._tally('jing_count')

    def _tally(self, field):
        """Return the sum of attribute ``field`` across all posts."""
        running = 0
        for post in self.posts:
            running += getattr(post, field)
        return running
class Post(object):
    """Word and link-mention statistics for a single journal entry.

    ``dom`` is the entry element; it must offer ``cssselect`` and contain at
    least one ``div.post`` descendant (an ``IndexError`` is raised by
    ``process`` otherwise).

    Attributes set by ``process``:
        total        -- word count of the whole post body
        quote_count  -- word count inside ``blockquote`` elements
        om_count     -- ``total - quote_count`` (reported as "original words")
        jerm_count   -- anchors whose text is exactly "jerm9x" (case-insensitive)
        jing_count   -- anchors whose text is exactly "jing" (case-insensitive)
    """

    # A token counts as a word if it contains at least one word character.
    # Compiled once for the class rather than on every word_count() call;
    # raw string avoids the invalid-escape warning, and re.UNICODE makes
    # Python 2 treat non-ASCII letters as word characters too.
    _WORD_RE = re.compile(r'\w+', re.UNICODE)

    def __init__(self, dom):
        self.dom = dom
        self.process()

    def word_count(self, elements):
        """Take a naive word count of a list of elements.

        Each element's ``text_content()`` is split on whitespace and every
        token containing a word character is counted. Returns 0 for an
        empty list.
        """
        has_word_char = self._WORD_RE.search
        # Counting with a generator avoids relying on filter() returning a
        # list, so this works on both Python 2 and Python 3.
        return sum(
            1
            for element in elements
            for token in element.text_content().split()
            if has_word_char(token)
        )

    def process(self):
        """Compute and store all counters for this entry."""
        post = self.dom.cssselect('div.post')[0]
        self.total = self.word_count([post])
        # NOTE(review): a blockquote nested inside another blockquote is
        # presumably matched twice here, inflating quote_count -- confirm
        # whether the journal markup ever nests quotes.
        self.quote_count = self.word_count(post.cssselect('blockquote'))
        self.om_count = self.total - self.quote_count

        # Only anchors whose entire (trimmed, lower-cased) text is the name
        # count as a mention.
        labels = [
            anchor.text_content().strip().lower()
            for anchor in post.cssselect('a')
        ]
        self.jerm_count = labels.count('jerm9x')
        self.jing_count = labels.count('jing')
def load_dom(url):
    """Load a url and return a dom element for that url.

    The page is fetched with ``urllib2`` and the raw body is handed to
    ``lxml.html.document_fromstring``. Any error raised by ``urlopen`` or
    ``read`` propagates to the caller.
    """
    # for some reason i don't trust lxml to read things off the internets
    response = urllib2.urlopen(url)
    try:
        body = response.read()
    finally:
        # urllib2 responses are not context managers on Python 2, so close
        # explicitly to release the socket even if read() fails.
        response.close()
    return html.document_fromstring(body)
def main():
    """Fetch the journal at ``url`` and print summary statistics to stdout.

    Prints the number of posts, total words, quoted versus original words
    (with the quoted percentage) and the linked-mention counts.
    """
    # The script defines no options of its own, but the command line is
    # still parsed so optparse can handle its built-in flags and reject
    # unknown ones; the parsed result itself is not needed.
    parse_args()

    blog = Blog(load_dom(url))
    total = blog.total_count

    # A page with no words at all would otherwise raise ZeroDivisionError.
    if total:
        quote_pct = 100 * float(blog.total_quote_count) / float(total)
    else:
        quote_pct = 0.0

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(" ** stats for %s ** " % url)
    print(" %d posts with %d total words" % (len(blog.posts), total))
    print(" %d words in blockquote, %d original words (%0.2f%% quotations)" % (
        blog.total_quote_count, blog.total_om_count, quote_pct))
    print(" %d linked mentions of jerm9x, %d of jing" % (
        blog.total_jerm_count, blog.total_jing_count))
def parse_args():
    """Parse ``sys.argv`` and return optparse's ``(options, args)`` pair.

    No custom options are registered; the parser only carries the usage
    string and version number.
    """
    from optparse import OptionParser
    return OptionParser(usage='%prog', version='1.0').parse_args()
# Produce the report only when run as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment