Skip to content

Instantly share code, notes, and snippets.

@nekopanic
Created August 30, 2012 19:33
Show Gist options
  • Save nekopanic/3538695 to your computer and use it in GitHub Desktop.
Save nekopanic/3538695 to your computer and use it in GitHub Desktop.
Is the Python documentation in British English or American English?
#!/usr/bin/env python
# Try to determine if a pile of text is American English or British English.
# Give it a list of files on standard input, e.g.: find /usr/share/doc/python-doc | ./puddlejump.py
import fileinput
from os import path
from HTMLParser import HTMLParser
files = []
all_words = []
american_word_count = 0
british_word_count = 0


def _load_dictionary(dict_path):
    """Return a dict mapping each stripped line of *dict_path* to True.

    The word list is read with a plain ``open`` inside a ``with`` block so
    the file is closed as soon as it has been read, instead of going through
    the ``fileinput`` module-level singleton.

    A dict (not a set) is returned on purpose: lookups written either as
    ``word in d`` or as ``d.has_key(word)`` keep working.

    Raises IOError/OSError if the word list cannot be opened.
    """
    with open(dict_path) as dict_file:
        return dict.fromkeys((entry.strip() for entry in dict_file), True)


#
# Load the file list and dictionaries
#
# With no arguments, fileinput.input() reads the files named on the command
# line, or standard input when there are none. Each line is one candidate path.
for line in fileinput.input():
    filename = line.strip()
    # isfile() is already False for paths that do not exist, so no separate
    # exists() check is needed; directories and other non-files are skipped.
    if path.isfile(filename):
        files.append(filename)
print("Examining {0} files".format(len(files)))

american_english = _load_dictionary('/usr/share/dict/american-english')
print("Loaded {0} words of American english".format(len(american_english)))

british_english = _load_dictionary('/usr/share/dict/british-english')
print("Loaded {0} words of British english".format(len(british_english)))
#
# This HTML Parser will help with the HTML docs
#
class HTMLWordParser(HTMLParser):
    """Collect the whitespace-separated words found in an HTML document's text.

    Words are appended to the list passed to the constructor, so the caller
    and the parser share one accumulator. Text inside ``<script>`` and
    ``<style>`` elements is ignored: it is code or styling, not prose, and
    would otherwise be counted as words.
    """

    # Elements whose character data should not be treated as prose.
    NON_PROSE_TAGS = ('script', 'style')

    def __init__(self, all_words):
        """Create a parser that appends every word it sees to *all_words*."""
        self.all_words = all_words
        # HTMLParser.__init__ calls self.reset(), which initialises
        # self._skipped_tag (see reset() below). The explicit base-class call
        # is kept instead of super() because HTMLParser is an old-style
        # class on Python 2.
        HTMLParser.__init__(self)

    def reset(self):
        """Reset parser state, including the "inside script/style" marker."""
        HTMLParser.reset(self)
        self._skipped_tag = None

    def handle_starttag(self, tag, attrs):
        """Start ignoring character data when a non-prose element opens."""
        if tag in self.NON_PROSE_TAGS:
            self._skipped_tag = tag

    def handle_endtag(self, tag):
        """Resume collecting words once the ignored element is closed."""
        if tag == self._skipped_tag:
            self._skipped_tag = None

    def handle_data(self, data):
        """Split a run of text on whitespace and record the resulting words."""
        if self._skipped_tag is None:
            self.all_words.extend(data.split())
# The parser appends straight into the shared all_words list.
html_parser = HTMLWordParser(all_words)
#
# Now count the words and print the result!
#
HTML_SUFFIXES = ('.html', '.htm')
for filename in files:
    print("Examining {0}".format(filename))
    is_html = filename.endswith(HTML_SUFFIXES)
    if not (is_html or filename.endswith('.txt')):
        # Only HTML and plain-text files are read; don't bother opening others.
        continue
    # The with-block closes the file even if reading raises.
    with open(filename) as fd:
        text = fd.read()
    if is_html:
        html_parser.feed(text)
        # close() flushes text still buffered after the last tag, and reset()
        # drops any half-parsed markup, so one truncated document cannot
        # bleed into the next file fed to the same parser.
        html_parser.close()
        html_parser.reset()
    else:
        all_words.extend(text.split())

# Counting happens once, after every file has been read. A word present in
# both dictionaries is counted for both sides, so only words unique to one
# dictionary affect the final difference. Matching is exact (case-sensitive,
# punctuation included). "in" replaces dict.has_key(), which no longer exists
# on Python 3.
american_word_count = sum(1 for word in all_words if word in american_english)
british_word_count = sum(1 for word in all_words if word in british_english)
print("Of {0} words, {1} were American English and {2} were British English".format(len(all_words), american_word_count, british_word_count))
# Nothing further is printed when the two counts are equal.
if american_word_count > british_word_count:
    print("America wins by {0} words".format(american_word_count - british_word_count))
elif british_word_count > american_word_count:
    print("Britain wins by {0} words".format(british_word_count - american_word_count))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment