@fish2000
Created September 30, 2010 05:51
python port of Arc90 readability.js (circa 2009)
#!/usr/bin/env python
# encoding: utf-8
#####
##### Readability.py -- by fish. © 2009, Some Rights Reserved But Some May Be
##### http://objectsinspaceandtime.com/
#####
##### it's a python port of the Readability JavaScript bookmarklet,
##### by Arc90 Labs --
##### http://lab.arc90.com/2009/03/readability.php
##### http://code.google.com/p/arc90labs-readability/
##### this script and the original are both licensed under the Apache License 2.0.
#####
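##### Example invocation (illustrative, not from the original gist; assumes Python 2
##### with the lxml and pyquery packages installed, and that ./out already exists):
#####
#####     python readability.py -t -h -o ./out http://objectsinspaceandtime.com/
#####
##### For that URL the path has no basename, so the hostname is used for naming, and
##### objectsinspaceandtime_com_extract.txt, _extract.html and _readable.html land in ./out.
#####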
import sys
import os
import os.path
import re
import getopt
import urllib2
import urlparse
import lxml
from pyquery import PyQuery as pq
from lxml import etree
urls = []
roster = {}
outpaths = {}
thisURL = ""
breakre = re.compile('<br/?>[ \r\n\s]*<br/?>', re.IGNORECASE | re.MULTILINE)
fontre = re.compile('<\/?font[^>]*>', re.IGNORECASE | re.MULTILINE)
fuckingbodyre = re.compile('<\/?body[^>]*?>', re.IGNORECASE)
multibreakre = re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}', re.IGNORECASE | re.MULTILINE)
badnamere = re.compile('(comment|meta|footer|footnote|side)', re.IGNORECASE)
goodclsre = re.compile('((^|\\s)(post|hentry|main|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))', re.IGNORECASE)
goodidre = re.compile('^(post|hentry|main|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$', re.IGNORECASE)
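# Illustrative note (not in the original source): with the patterns above, badnamere
# penalizes class/id values containing e.g. "comment", "footer" or "sidebar", while
# goodclsre and goodidre reward values such as "post", "hentry", "entry-content" or
# "article-body".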
readabilityversion = "0.4"
emailsrc = 'http://proto1.arc90.com/readability/email.php'
iframeLoads = 0
def main(argv):
    view = False
    toview = []
    textout = False
    htmlout = False
    pageout = True
    termout = True
    basepath = os.getcwd()
    try:
        opts, args = getopt.getopt(argv, "wo:thpv", [
            "wtf",
            "out=",
            "text",
            "html",
            "page",
            "view",
        ])
    except getopt.GetoptError:
        print "Error: Bad GetOpt"
        echousage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-w", "--wtf"):
            echousage()
            sys.exit(0)
        elif opt in ("-o", "--out"):
            if os.path.exists(arg):
                basepath = os.path.realpath(arg)
        elif opt in ("-t", "--text"):
            textout = True
        elif opt in ("-h", "--html"):
            htmlout = True
        elif opt in ("-p", "--page"):
            pageout = True
        elif opt in ("-v", "--view"):
            view = True
    if (len(args) > 0):
        for urlarg in args:
            if (urlarg.startswith('http://')):
                urls.append(urlarg)
            else:
                print "WARNING: Bad URL: "+unicode(urlarg).encode("utf-8")
    else:
        urls.append('http://objectsinspaceandtime.com/')
    for url in urls:
        urlp = urlparse.urlparse(url)
        urlb = urlparse.urlsplit(url)
        # derive the output filename from the URL path, falling back to the
        # hostname, then the directory name, then a constant
        fileout = os.path.splitext(os.path.basename(urlp.path))[0]
        if (fileout == "" or fileout == None):
            fileout = re.sub("\.", "_", urlb.netloc)
        if (fileout == "" or fileout == None):
            fileout = os.path.splitext(os.path.dirname(urlb.path))[0]
        if (fileout == "" or fileout == None):
            fileout = "unknown"
        print "\nURL: "+unicode(url).encode("utf-8")
        htmlobject = urllib2.urlopen(url)
        thehtml = htmlobject.read()
        (thediv, cleanpage) = grabarticle(thehtml, url)
        if (textout):
            txtfile(os.path.join(basepath, (fileout+"_extract.txt")), thediv.text())
            toview.append(os.path.join(basepath, (fileout+"_extract.txt")))
        if (htmlout):
            txtfile(os.path.join(basepath, (fileout+"_extract.html")), thediv.html())
            toview.append(os.path.join(basepath, (fileout+"_extract.html")))
        if (pageout):
            txtfile(os.path.join(basepath, (fileout+"_readable.html")), readability(thediv, cleanpage, theurlbase=str(urlb.geturl())))
            toview.append(os.path.join(basepath, (fileout+"_readable.html")))
    if (view and len(toview) > 0):
        # not cross-platform at all -- argv[0] must name the program for execv
        os.execv('/usr/bin/open', (['open', '-a', '/Applications/Safari.app'] + toview))
def grabarticle(html, url):
    global thisURL
    thisURL = ""+url
    roster[url] = {}
    ### ORIGINAL COMMENT: Replace all doubled-up <BR> tags with <P> tags, and remove fonts.
    nhtml = breakre.sub('</p><p>', html)
    mhtml = fontre.sub('', nhtml)
    d = pq(mhtml, parser='html')
    allparagraphs = d('p, blockquote, span')
    if len(allparagraphs) < 1:
        allparagraphs = d('span, div')
    topdivcount = 0
    td = None
    topdiv = None
    ### ORIGINAL COMMENT: Study all the paragraphs and find the chunk that has the best score.
    ### ORIGINAL COMMENT: A score is determined by things like: Number of <p>'s, commas, special classes, etc.
    try:
        allparagraphs.map(assignscore)
    except KeyError:
        # but why?
        print 'KEY ERROR!'
    print "\nScoreboard Results:"
    for sk, sv in roster[url].iteritems():
        print "Object: "+str(sk)+"\t\tScore: "+str(sv)
    print "\n"
    ### ORIGINAL COMMENT: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
    scoreboard = roster[url]
    for tk in scoreboard.iterkeys():
        if (td == None):
            td = tk
        elif (scoreboard[tk] > scoreboard[td]):
            td = tk
    if td == None:
        out = "ERROR: This page sucks. The content from: " + str(url) + "... was totally unreadable."
        out += "\nroster:"
        out += unicode(roster).encode("utf-8")
        print out
        return (pq("<div><h1>"+out+"</h1></div>"), url)
    else:
        topdiv = pq(td).clone()
    ### ORIGINAL COMMENT: REMOVES ALL STYLESHEETS ...
    #d('head').find('link').filter(lambda i: pq(this).attr('rel') == 'stylesheet').remove()
    d('head').find('link').remove()
    ### ORIGINAL COMMENT: Remove all style tags in head (not doing this on IE) :
    d('head').find('style').remove()
    d('head').find('script').remove()
    ### ORIGINAL COMMENT: Removes all style attributes
    topdiv.find("*").attr('style', 'void:none;')
    topdiv.find("*").removeAttr('style')
    ### ORIGINAL COMMENT: Goes in and removes DIV's that have more non <p> stuff than <p> stuff
    topdiv.find('div').each(killdivs)
    topdiv.find('script').remove()
    d.find('div').each(killdivs)
    d.find('script').remove()
    ### ORIGINAL COMMENT: Removes any consecutive <br />'s into just one <br />
    #hh = multibreakre.sub(topdiv.html(), '<br />')
    hh = multibreakre.sub('<br />', topdiv.html())
    topdiv = pq("<div></div>").html(hh)
    ### ORIGINAL COMMENT: Cleans out junk from the topDiv just in case:
    topdiv.find('form').each(cleanit)
    topdiv.find('object').each(cleanit)
    topdiv.find('table').each(cleanit)
    topdiv.find('h1').each(cleanit)
    topdiv.find('h2').each(cleanit)
    topdiv.find('iframe').each(cleanit)
    d.find('form').each(cleanit)
    d.find('object').each(cleanit)
    d.find('table').each(cleanit)
    d.find('h1').each(cleanit)
    d.find('h2').each(cleanit)
    d.find('iframe').each(cleanit)
    ### ORIGINAL COMMENT: Add the footer and contents:
    return (topdiv, d)
## function for each call to assign score
def calculatescore(dompart, itsmyfirsttime):
    ### ORIGINAL COMMENT: Initialize readability data
    ppart = dompart.getparent()
    pscore = 0
    if itsmyfirsttime:
        ### ORIGINAL COMMENT: Look for a special classname
        thecls = pq(ppart).attr('class')
        theid = pq(ppart).attr('id')
        if (thecls != None):
            if (badnamere.search(thecls) != None):
                pscore -= 50
            elif (goodclsre.search(thecls) != None):
                pscore += 25
        ### ORIGINAL COMMENT: Look for a special ID
        if (theid != None):
            if (badnamere.search(pq(ppart).attr('id')) != None):
                pscore -= 50
            elif (goodidre.search(pq(ppart).attr('id')) != None):
                pscore += 25
    ### ORIGINAL COMMENT: Add a point for the paragraph found
    if len(pq(dompart).text()) > 10:
        pscore += 1
    ### ORIGINAL COMMENT: Add points for any commas within this paragraph
    pscore += getcharcount(dompart)
    return pscore
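# Worked example of the scoring above (illustrative): a <p> holding 45 characters of
# text and 3 commas, whose parent <div class="entry-content"> is being seen for the
# first time, contributes 25 (good class) + 1 (text longer than 10 chars) + 3 (commas) = 29.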
def assignscore(idx, dompart):
    global thisURL
    ppart = dompart.getparent()
    scoreboard = roster[thisURL]
    virginscore = (ppart not in scoreboard)
    iscored = calculatescore(dompart, virginscore)
    if virginscore:
        scoreboard[ppart] = iscored
    else:
        scoreboard[ppart] += iscored
### ORIGINAL COMMENT: Get character count
def getcharcount(dompart, *args):
    # counts occurrences of a separator character (comma by default) in the text
    metric = ","
    if len(args) > 0:
        metric = str(args[0])
    return pq(dompart).text().count(metric)
def killdivs(thing):
    ### ORIGINAL COMMENT: If the number of commas is less than 10 (bad sign) ...
    if getcharcount(thing) < 10:
        dp = pq(thing)
        ip = len(dp.find('p'))
        iimg = len(dp.find('img'))
        ili = len(dp.find('li'))
        ia = len(dp.find('a'))
        iembed = len(dp.find('embed'))
        if (iimg > ip or ili > ip or ia > ip or ip == 0 or iembed > 0):
            dp.remove()
def cleanit(thing):
    ### ORIGINAL COMMENT: If the text content isn't laden with words, remove the child:
    minwords = 10000000
    if pq(thing).is_("table"):
        minwords = 250
    if getcharcount(thing, " ") < minwords:
        pq(thing).remove()
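# Illustrative effect of the two cleaners above: killdivs drops a <div> that has fewer
# than 10 commas and, say, two links but no <p> children, while cleanit removes forms,
# objects, h1/h2 headings and iframes outright and keeps only tables whose text contains
# at least 250 spaces (roughly 250 words).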
def readability(rdiv, rcontext, *args, **kwargs):
    toolshtml = """
    <a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>
    <a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>
    <a href='#' onclick='emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>"""
    readheadhtml = """
    <link rel='stylesheet' href='http://lab.arc90.com/experiments/readability/css/readability.css' media='screen' type='text/css' />
    <link rel='stylesheet' href='http://lab.arc90.com/experiments/readability/css/readability-print.css' media='print' type='text/css' />"""
    readfooterhtml = """
    <a href='http://www.arc90.com'><img src='http://lab.arc90.com/experiments/readability/images/footer.png' /></a>
    <div class='footer-right'><span class='version'>Readability version readabilityVersion</span></div>"""
    readstyle = 'style-newspaper'
    readsize = 'size-large'
    readmargin = 'margin-wide'
    overlay = pq('<div id="readOverlay"></div>')
    innerdiv = pq('<div id="readInner"></div>')
    articletools = pq('<div id="readTools"></div>')
    articlecontent = pq('<div></div>')
    ### ORIGINAL COMMENT: Grab the title from the <title> tag and inject it as the title.
    articletitle = pq('<h1>'+unicode(rcontext('title').text()).encode("utf-8")+'</h1>')
    articlefooter = pq('<div id="readFooter"></div>')
    articlefooter.html(readfooterhtml)
    articletools.html(toolshtml)
    articlecontent.append((rdiv))
    innerdiv.addClass(readmargin).addClass(readsize)
    innerdiv.append((articletitle))
    innerdiv.append((articlecontent))
    overlay.addClass(readstyle)
    if len(rcontext.find('body')) < 1:
        rcontext.append(pq('<body></body>'))
    rcontext.find('body').empty()
    rcontext.find('head').eq(0).append((readheadhtml))
    rcontext.find('body').eq(0).append((innerdiv))
    rcontext.find('body').eq(0).append((articletools))
    rcontext.find('body').eq(0).addClass(readstyle)
    rcontext.find('body').eq(0).append((articlefooter))
    if (kwargs.get('theurlbase') != None):
        tbb = kwargs['theurlbase']
        rcontext.make_links_absolute(base_url=tbb)
        # rewrite any remaining relative image sources against the original URL
        rcontext.find('img').each(lambda aimg: pq(aimg).attr('src').startswith('http://') and (1) or (pq(aimg).attr('src', urlparse.urljoin(tbb, pq(aimg).attr('src')))))
    return rcontext
def txtfile(where, what):
    if os.path.exists(os.path.split(where)[0]):
        print "Writing file %s..." % where
        fff = open(where, 'w')
        fff.write(unicode(what).encode("utf-8"))
        fff.close()
        return True
    else:
        return False
def echousage():
    print "\n"
    print "readability.py 0.1. (c) 2009 Fish. All rights reserved."
    print "\thttp://objectsinspaceandtime.com/"
    print "Adapted from the Readability JavaScript bookmarklet by Arc90 Labs:"
    print "\thttp://lab.arc90.com/2009/03/readability.php"
    print "Usage:"
    print "%s [-wothpv] url [url, url ...]" % __file__
    print "\t-w\t--wtf\t\tPrint this message"
    print "\t-o DIR\t--out=DIR\tSpecify a path for the output files"
    print "\t-t\t--text\t\tSave text extract as a text file"
    print "\t-h\t--html\t\tSave text extract as an HTML file"
    print "\t-p\t--page\t\tSave text extract within a Readability(tm) HTML page"
    print "\t-v\t--view\t\tView HTML files in the browser after generation"
    print "\n"
if __name__ == '__main__':
    main(sys.argv[1:])