Skip to content

Instantly share code, notes, and snippets.

@kk6
Created July 30, 2011 13:49
Show Gist options
  • Save kk6/1115543 to your computer and use it in GitHub Desktop.
Save kk6/1115543 to your computer and use it in GitHub Desktop.
英単語カウンター。なんか色々おかしい(´・ω・`)
#! /usr/bin/env python
#-*- coding:utf-8 -*-
import re
import urllib2
import contextlib
import collections
from pyquery import PyQuery as pq
def get_html(url, enc='utf-8'):
    """Fetch *url* and return the response body as decoded text.

    Args:
        url: Address passed to ``urllib2.urlopen``.
        enc: Codec used to decode the raw response bytes (default UTF-8).

    Returns:
        The full response body decoded with *enc*.

    Raises:
        UnicodeDecodeError: If the body is not valid in *enc* (strict
            decoding is used). Errors raised by ``urllib2.urlopen`` itself
            propagate unchanged.
    """
    # closing() guarantees the response object is closed even if reading
    # or decoding fails.
    with contextlib.closing(urllib2.urlopen(url)) as response:
        return response.read().decode(enc)
def extract_text(source):
    """Return the text of the document's ``<body>`` with blank runs collapsed.

    Args:
        source: HTML markup handed to PyQuery.

    Returns:
        The text of the ``body`` selection, where every line break that is
        followed by one or more whitespace characters has been replaced by
        a single ``'\\n'``.
    """
    body_text = pq(source)("body").text()
    # A CR or LF plus all whitespace after it (including further line
    # breaks and indentation) shrinks to exactly one newline.
    return re.sub(r'[\r\n]\s+', '\n', body_text)
def set_textfile(text, enc='utf-8'):
    """Count the words in *text* and write the tally to ``output.txt``.

    Words are the whitespace-separated tokens of *text*. Each output line
    holds one word, left-aligned in a 30-character column, followed by
    ``:`` and its count right-aligned in 3 characters. Lines are written in
    the order given by ``Counter.most_common()`` (most frequent first).

    Args:
        text: The text to analyse. A byte string is decoded with *enc*
            first so that counting and padding operate on characters.
        enc: Encoding of the output file (and of *text* if it is bytes).

    Returns:
        None. The result is written to ``output.txt`` in the current
        working directory, replacing any existing file.
    """
    # Local import keeps this block self-contained; io.open behaves the
    # same on Python 2 and 3 and encodes once at the file boundary.
    import io

    if isinstance(text, bytes):
        text = text.decode(enc)

    # split() with no argument splits on *any* whitespace run and drops
    # empty tokens. The previous split(' ') left newlines glued into words
    # and counted '' for every doubled space.
    tally = collections.Counter(text.split())

    with io.open('output.txt', 'w', encoding=enc) as f:
        f.writelines(
            u"{0:<30}:{1:>3}\n".format(word, count)
            for word, count in tally.most_common()
        )
if __name__ == '__main__':
    # Script entry point: download the page, reduce it to body text, and
    # hand that text to set_textfile for counting.
    page_url = "http://docs.python.org/library/functions.html"
    set_textfile(extract_text(get_html(page_url)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment