# kk6/get-all-text.py

## get-all-text.py
#! /usr/bin/env python
#-*- coding:utf-8 -*-

import re
import urllib2
import contextlib
import collections
from pyquery import PyQuery as pq


def get_html(url, enc='utf-8'):
    """Fetch *url* and return the response body decoded with *enc*.

    The response object is closed as soon as the body has been read and
    decoded, including when either step raises.
    """
    response = urllib2.urlopen(url)
    with contextlib.closing(response):
        raw = response.read()
        return raw.decode(enc)

def extract_text(source):
    """Return the text of whatever the ``body`` selector matches in *source*.

    A line break (``\\r`` or ``\\n``) that is followed by more whitespace is
    replaced, together with that whitespace, by a single ``\\n``.
    """
    body_text = pq(source)("body").text()
    return re.sub(r'[\r\n]\s+', '\n', body_text)

def set_textfile(text, enc='utf-8'):
    """Write a word-frequency table for *text* to ``output.txt``.

    *text* is split on any run of whitespace, the occurrences of each word
    are counted, and one line per distinct word is written, most frequent
    first.  Each line is the word left-aligned in a 30-character column,
    a colon, and the count right-aligned in a 3-character column.

    Args:
        text: The text to analyse (a unicode string).
        enc: Encoding used for the output file.
    """
    # Local import so this function does not rely on the module's import block.
    import io

    # split() with no argument treats spaces, tabs and newlines alike and
    # never yields empty strings; split(' ') glued words across line breaks
    # and counted '' for every doubled space (and for empty input).
    counts = collections.Counter(text.split())

    # Let the file object do the encoding: words are then padded by
    # character count rather than by encoded byte length, and the same code
    # runs on both Python 2 and Python 3.
    with io.open('output.txt', 'w', encoding=enc) as f:
        for word, count in counts.most_common():
            f.write(u"{0:<30}:{1:>3}\n".format(word, count))

if __name__ == '__main__':
    # Script entry point: download the page, extract its body text and
    # hand that text to set_textfile.
    page = get_html("http://docs.python.org/library/functions.html")
    set_textfile(extract_text(page))
	#! /usr/bin/env python
	#-*- coding:utf-8 -*-

	import re
	import urllib2
	import contextlib
	import collections
	from pyquery import PyQuery as pq


	def get_html(url, enc='utf-8'):
	    """Fetch *url* and return the response body decoded with *enc*.

	    The response object is closed as soon as the body has been read and
	    decoded, including when either step raises.
	    """
	    # Nesting restored: the statements below had lost their indentation
	    # and no longer formed the body of this function.
	    with contextlib.closing(urllib2.urlopen(url)) as response:
	        return response.read().decode(enc)

	def extract_text(source):
	    """Return the text of whatever the ``body`` selector matches in *source*.

	    A line break (``\\r`` or ``\\n``) that is followed by more whitespace is
	    replaced, together with that whitespace, by a single ``\\n``.
	    """
	    # Nesting restored: the statements below had lost their indentation
	    # and no longer formed the body of this function.
	    Newlines = re.compile(r'[\r\n]\s+')
	    d = pq(source)
	    body = d("body")
	    text = body.text()
	    return Newlines.sub('\n', text)

	def set_textfile(text, enc='utf-8'):
	    """Write a word-frequency table for *text* to ``output.txt``.

	    *text* is split on any run of whitespace, the occurrences of each word
	    are counted, and one line per distinct word is written, most frequent
	    first.  Each line is the word left-aligned in a 30-character column,
	    a colon, and the count right-aligned in a 3-character column.

	    Args:
	        text: The text to analyse (a unicode string).
	        enc: Encoding used for the output file.
	    """
	    # Local import so this function does not rely on the module's import block.
	    import io

	    # split() with no argument treats spaces, tabs and newlines alike and
	    # never yields empty strings; split(' ') glued words across line breaks
	    # and counted '' for every doubled space (and for empty input).
	    counts = collections.Counter(text.split())

	    # Let the file object do the encoding: words are then padded by
	    # character count rather than by encoded byte length, and the same code
	    # runs on both Python 2 and Python 3.
	    with io.open('output.txt', 'w', encoding=enc) as f:
	        for word, count in counts.most_common():
	            f.write(u"{0:<30}:{1:>3}\n".format(word, count))

	if __name__ == '__main__':
	    # Script entry point: download the page, extract its body text and
	    # hand that text to set_textfile.  Nesting restored: these statements
	    # had lost their indentation under the guard.
	    url = "http://docs.python.org/library/functions.html"
	    source = get_html(url)
	    text = extract_text(source)
	    set_textfile(text)