Skip to content

Instantly share code, notes, and snippets.

@oyvindio
Created February 19, 2010 14:16
Show Gist options
  • Save oyvindio/308730 to your computer and use it in GitHub Desktop.
Save oyvindio/308730 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Filling out forms is hard, let's do some Python.
This program determines the symbol and word count excluding markup for
a (X)HTML document, by passing a POST request to the count_words.php
script at the INF5270 web site.
Note that there is NO error handling whatsoever, so use at your own peril.
Usage:
./diw-wc.py file.html
cat file.html | ./diw-wc.py
Dependencies:
BeautifulSoup; on Debian/Ubuntu, run sudo aptitude install python-beautifulsoup,
or pull it from PyPI with pip install BeautifulSoup or
easy_install BeautifulSoup.
"""
from sys import argv, stdin
from urllib import urlencode
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
import re
def getCounts(html):
    """
    Fetch symbol and word counts, excluding (X)HTML tags, from the
    count_words.php script at the INF5270 web site.

    The document is sent as the ``body`` field of a POST request and the
    numbers are scraped out of the page that comes back.

    Args:
        html: source of the (X)HTML document to be counted.

    Returns:
        A tuple holding every text node of the response that consists
        solely of the digits 0-9, in document order. The script's entry
        point reads element 0 as the symbol count and element 1 as the
        word count.

    No exception is caught here: failures in urlopen or in parsing the
    reply propagate to the caller.
    """
    site = urlopen("http://www.ifi.uio.no/~inf5270/ownstuff/count_words.php",
                   urlencode({"body": html}))
    try:
        page = site.read()
    finally:
        # Release the connection even if reading fails; an explicit close()
        # is used because the urllib2 response is not relied upon to be a
        # context manager.
        site.close()
    soup = BeautifulSoup(page)
    return tuple(soup.findAll(text=re.compile("^[0-9]+$")))
if __name__ == "__main__":
    # Script entry point: take the document from the file named by the
    # first command-line argument, or from standard input when no argument
    # is given, then print the counts reported by getCounts().
    if len(argv) == 1:
        inputHTML = stdin.read()
    else:
        # The with-block closes the file handle as soon as it has been read.
        with open(argv[1]) as source:
            inputHTML = source.read()
    counts = getCounts(inputHTML)
    # A single parenthesised argument prints identically with the Python 2
    # print statement and the Python 3 print function.
    print("{0} symbols, {1} words".format(counts[0], counts[1]))
(defun diw-word-count ()
  "Report symbol and word counts for the buffer, ignoring (X)HTML markup.
The accessible portion of the current buffer is piped through the
external command `diw-wc.py' via `shell-command-on-region'."
  (interactive)
  (let ((start (point-min))
        (end (point-max)))
    (shell-command-on-region start end "diw-wc.py")))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment