@fish2000
Created September 30, 2010 05:51
python port of Arc90 readability.js (circa 2009)
#!/usr/bin/env python
# encoding: utf-8
#####
##### Readability.py -- by fish. © 2009, Some Rights Reserved But Some May Be
##### http://objectsinspaceandtime.com/
#####
##### it's a python port of the Readability JavaScript bookmarklet,
##### by Arc90 Labs --
##### http://lab.arc90.com/2009/03/readability.php
##### http://code.google.com/p/arc90labs-readability/
##### this script and the original are both licensed under the Apache License 2.0.
#####
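##### Example invocation (illustrative, not from the original gist; assumes Python 2
##### with the lxml and pyquery packages installed, and that ./out already exists):
#####
#####     python readability.py -t -h -o ./out http://objectsinspaceandtime.com/
#####
##### For that URL the path has no basename, so the hostname is used for naming, and
##### objectsinspaceandtime_com_extract.txt, _extract.html and _readable.html land in ./out.
#####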
import sys
import os
import os.path
import re
import getopt
import urllib2
import urlparse
import lxml
from pyquery import PyQuery as pq
from lxml import etree
urls = []
roster = {}
outpaths = {}
thisURL = ""
breakre = re.compile('<br/?>[ \r\n\s]*<br/?>', re.IGNORECASE | re.MULTILINE)
fontre = re.compile('<\/?font[^>]*>', re.IGNORECASE | re.MULTILINE)
fuckingbodyre = re.compile('<\/?body[^>]*?>', re.IGNORECASE)
multibreakre = re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}', re.IGNORECASE | re.MULTILINE)
badnamere = re.compile('(comment|meta|footer|footnote|side)', re.IGNORECASE)
goodclsre = re.compile('((^|\\s)(post|hentry|main|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))', re.IGNORECASE)
goodidre = re.compile('^(post|hentry|main|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$', re.IGNORECASE)
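# Illustrative note (not in the original source): with the patterns above, badnamere
# penalizes class/id values containing e.g. "comment", "footer" or "sidebar", while
# goodclsre and goodidre reward values such as "post", "hentry", "entry-content" or
# "article-body".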
readabilityversion = "0.4"
emailsrc = 'http://proto1.arc90.com/readability/email.php'
iframeLoads = 0
def main(argv):
    view = False
    toview = []
    textout = False
    htmlout = False
    pageout = True
    termout = True
    basepath = os.getcwd()
    try:
        opts, args = getopt.getopt(argv, "wo:thpv", [
            "wtf",
            "out=",
            "text",
            "html",
            "page",
            "view",
        ])
    except getopt.GetoptError:
        print "Error: Bad GetOpt"
        echousage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-w", "--wtf"):
            echousage()
            sys.exit(0)
        elif opt in ("-o", "--out"):
            if os.path.exists(arg):
                basepath = os.path.realpath(arg)
        elif opt in ("-t", "--text"):
            textout = True
        elif opt in ("-h", "--html"):
            htmlout = True
        elif opt in ("-p", "--page"):
            pageout = True
        elif opt in ("-v", "--view"):
            view = True
    if (len(args) > 0):
        for urlarg in args:
            if (urlarg.startswith('http://')):
                urls.append(urlarg)
            else:
                print "WARNING: Bad URL: "+unicode(urlarg).encode("utf-8")
    else:
        urls.append('http://objectsinspaceandtime.com/')
    for url in urls:
        urlp = urlparse.urlparse(url)
        urlb = urlparse.urlsplit(url)
        # derive the output filename from the URL path, falling back to the
        # hostname, then the directory name, then a constant
        fileout = os.path.splitext(os.path.basename(urlp.path))[0]
        if (fileout == "" or fileout == None):
            fileout = re.sub("\.", "_", urlb.netloc)
        if (fileout == "" or fileout == None):
            fileout = os.path.splitext(os.path.dirname(urlb.path))[0]
        if (fileout == "" or fileout == None):
            fileout = "unknown"
        print "\nURL: "+unicode(url).encode("utf-8")
        htmlobject = urllib2.urlopen(url)
        thehtml = htmlobject.read()
        (thediv, cleanpage) = grabarticle(thehtml, url)
        if (textout):
            txtfile(os.path.join(basepath, (fileout+"_extract.txt")), thediv.text())
            toview.append(os.path.join(basepath, (fileout+"_extract.txt")))
        if (htmlout):
            txtfile(os.path.join(basepath, (fileout+"_extract.html")), thediv.html())
            toview.append(os.path.join(basepath, (fileout+"_extract.html")))
        if (pageout):
            txtfile(os.path.join(basepath, (fileout+"_readable.html")), readability(thediv, cleanpage, theurlbase=str(urlb.geturl())))
            toview.append(os.path.join(basepath, (fileout+"_readable.html")))
    if (view and len(toview) > 0):
        # not cross-platform at all -- argv[0] must name the program for execv
        os.execv('/usr/bin/open', (['open', '-a', '/Applications/Safari.app'] + toview))
def grabarticle(html, url):
    global thisURL
    thisURL = ""+url
    roster[url] = {}
    ### ORIGINAL COMMENT: Replace all doubled-up <BR> tags with <P> tags, and remove fonts.
    nhtml = breakre.sub('</p><p>', html)
    mhtml = fontre.sub('', nhtml)
    d = pq(mhtml, parser='html')
    allparagraphs = d('p, blockquote, span')
    if len(allparagraphs) < 1:
        allparagraphs = d('span, div')
    topdivcount = 0
    td = None
    topdiv = None
    ### ORIGINAL COMMENT: Study all the paragraphs and find the chunk that has the best score.
    ### ORIGINAL COMMENT: A score is determined by things like: Number of <p>'s, commas, special classes, etc.
    try:
        allparagraphs.map(assignscore)
    except KeyError:
        # but why?
        print 'KEY ERROR!'
    print "\nScoreboard Results:"
    for sk, sv in roster[url].iteritems():
        print "Object: "+str(sk)+"\t\tScore: "+str(sv)
    print "\n"
    ### ORIGINAL COMMENT: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
    scoreboard = roster[url]
    for tk in scoreboard.iterkeys():
        if (td == None):
            td = tk
        elif (scoreboard[tk] > scoreboard[td]):
            td = tk
    if td == None:
        out = "ERROR: This page sucks. The content from: " + str(url) + "... was totally unreadable."
        out += "\nroster:"
        out += unicode(roster).encode("utf-8")
        print out
        return (pq("<div><h1>"+out+"</h1></div>"), url)
    else:
        topdiv = pq(td).clone()
    ### ORIGINAL COMMENT: REMOVES ALL STYLESHEETS ...
    #d('head').find('link').filter(lambda i: pq(this).attr('rel') == 'stylesheet').remove()
    d('head').find('link').remove()
    ### ORIGINAL COMMENT: Remove all style tags in head (not doing this on IE) :
    d('head').find('style').remove()
    d('head').find('script').remove()
    ### ORIGINAL COMMENT: Removes all style attributes
    topdiv.find("*").attr('style', 'void:none;')
    topdiv.find("*").removeAttr('style')
    ### ORIGINAL COMMENT: Goes in and removes DIV's that have more non <p> stuff than <p> stuff
    topdiv.find('div').each(killdivs)
    topdiv.find('script').remove()
    d.find('div').each(killdivs)
    d.find('script').remove()
    ### ORIGINAL COMMENT: Removes any consecutive <br />'s into just one <br />
    #hh = multibreakre.sub(topdiv.html(), '<br />')
    hh = multibreakre.sub('<br />', topdiv.html())
    topdiv = pq("<div></div>").html(hh)
    ### ORIGINAL COMMENT: Cleans out junk from the topDiv just in case:
    topdiv.find('form').each(cleanit)
    topdiv.find('object').each(cleanit)
    topdiv.find('table').each(cleanit)
    topdiv.find('h1').each(cleanit)
    topdiv.find('h2').each(cleanit)
    topdiv.find('iframe').each(cleanit)
    d.find('form').each(cleanit)
    d.find('object').each(cleanit)
    d.find('table').each(cleanit)
    d.find('h1').each(cleanit)
    d.find('h2').each(cleanit)
    d.find('iframe').each(cleanit)
    ### ORIGINAL COMMENT: Add the footer and contents:
    return (topdiv, d)
## function for each call to assign score
def calculatescore(dompart, itsmyfirsttime):
    ### ORIGINAL COMMENT: Initialize readability data
    ppart = dompart.getparent()
    pscore = 0
    if itsmyfirsttime:
        ### ORIGINAL COMMENT: Look for a special classname
        thecls = pq(ppart).attr('class')
        theid = pq(ppart).attr('id')
        if (thecls != None):
            if (badnamere.search(thecls) != None):
                pscore -= 50
            elif (goodclsre.search(thecls) != None):
                pscore += 25
        ### ORIGINAL COMMENT: Look for a special ID
        if (theid != None):
            if (badnamere.search(pq(ppart).attr('id')) != None):
                pscore -= 50
            elif (goodidre.search(pq(ppart).attr('id')) != None):
                pscore += 25
    ### ORIGINAL COMMENT: Add a point for the paragraph found
    if len(pq(dompart).text()) > 10:
        pscore += 1
    ### ORIGINAL COMMENT: Add points for any commas within this paragraph
    pscore += getcharcount(dompart)
    return pscore
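# Worked example of the scoring above (illustrative): a <p> holding 45 characters of
# text and 3 commas, whose parent <div class="entry-content"> is being seen for the
# first time, contributes 25 (good class) + 1 (text longer than 10 chars) + 3 (commas) = 29.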
def assignscore(idx, dompart):
    global thisURL
    ppart = dompart.getparent()
    scoreboard = roster[thisURL]
    virginscore = (ppart not in scoreboard)
    iscored = calculatescore(dompart, virginscore)
    if virginscore:
        scoreboard[ppart] = iscored
    else:
        scoreboard[ppart] += iscored
### ORIGINAL COMMENT: Get character count
def getcharcount(dompart, *args):
    # counts occurrences of a separator character (comma by default) in the text
    metric = ","
    if len(args) > 0:
        metric = str(args[0])
    return pq(dompart).text().count(metric)
def killdivs(thing):
    ### ORIGINAL COMMENT: If the number of commas is less than 10 (bad sign) ...
    if getcharcount(thing) < 10:
        dp = pq(thing)
        ip = len(dp.find('p'))
        iimg = len(dp.find('img'))
        ili = len(dp.find('li'))
        ia = len(dp.find('a'))
        iembed = len(dp.find('embed'))
        if (iimg > ip or ili > ip or ia > ip or ip == 0 or iembed > 0):
            dp.remove()
def cleanit(thing):
    ### ORIGINAL COMMENT: If the text content isn't laden with words, remove the child:
    minwords = 10000000
    if pq(thing).is_("table"):
        minwords = 250
    if getcharcount(thing, " ") < minwords:
        pq(thing).remove()
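# Illustrative effect of the two cleaners above: killdivs drops a <div> that has fewer
# than 10 commas and, say, two links but no <p> children, while cleanit removes forms,
# objects, h1/h2 headings and iframes outright and keeps only tables whose text contains
# at least 250 spaces (roughly 250 words).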
def readability(rdiv, rcontext, *args, **kwargs):
    toolshtml = """
    <a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>
    <a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>
    <a href='#' onclick='emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>"""
    readheadhtml = """
    <link rel='stylesheet' href='http://lab.arc90.com/experiments/readability/css/readability.css' media='screen' type='text/css' />
    <link rel='stylesheet' href='http://lab.arc90.com/experiments/readability/css/readability-print.css' media='print' type='text/css' />"""
    readfooterhtml = """
    <a href='http://www.arc90.com'><img src='http://lab.arc90.com/experiments/readability/images/footer.png' /></a>
    <div class='footer-right'><span class='version'>Readability version readabilityVersion</span></div>"""
    readstyle = 'style-newspaper'
    readsize = 'size-large'
    readmargin = 'margin-wide'
    overlay = pq('<div id="readOverlay"></div>')
    innerdiv = pq('<div id="readInner"></div>')
    articletools = pq('<div id="readTools"></div>')
    articlecontent = pq('<div></div>')
    ### ORIGINAL COMMENT: Grab the title from the <title> tag and inject it as the title.
    articletitle = pq('<h1>'+unicode(rcontext('title').text()).encode("utf-8")+'</h1>')
    articlefooter = pq('<div id="readFooter"></div>')
    articlefooter.html(readfooterhtml)
    articletools.html(toolshtml)
    articlecontent.append((rdiv))
    innerdiv.addClass(readmargin).addClass(readsize)
    innerdiv.append((articletitle))
    innerdiv.append((articlecontent))
    overlay.addClass(readstyle)
    if len(rcontext.find('body')) < 1:
        rcontext.append(pq('<body></body>'))
    rcontext.find('body').empty()
    rcontext.find('head').eq(0).append((readheadhtml))
    rcontext.find('body').eq(0).append((innerdiv))
    rcontext.find('body').eq(0).append((articletools))
    rcontext.find('body').eq(0).addClass(readstyle)
    rcontext.find('body').eq(0).append((articlefooter))
    if (kwargs.get('theurlbase') != None):
        tbb = kwargs['theurlbase']
        rcontext.make_links_absolute(base_url=tbb)
        # rewrite any remaining relative image sources against the original URL
        rcontext.find('img').each(lambda aimg: pq(aimg).attr('src').startswith('http://') and (1) or (pq(aimg).attr('src', urlparse.urljoin(tbb, pq(aimg).attr('src')))))
    return rcontext
def txtfile(where, what):
    if os.path.exists(os.path.split(where)[0]):
        print "Writing file %s..." % where
        fff = open(where, 'w')
        fff.write(unicode(what).encode("utf-8"))
        fff.close()
        return True
    else:
        return False
def echousage():
    print "\n"
    print "readability.py 0.1. (c) 2009 Fish. All rights reserved."
    print "\thttp://objectsinspaceandtime.com/"
    print "Adapted from the Readability JavaScript bookmarklet by Arc90 Labs:"
    print "\thttp://lab.arc90.com/2009/03/readability.php"
    print "Usage:"
    print "%s [-wothpv] url [url, url ...]" % __file__
    print "\t-w\t--wtf\t\tPrint this message"
    print "\t-o DIR\t--out=DIR\tSpecify a path for the output files"
    print "\t-t\t--text\t\tSave text extract as a text file"
    print "\t-h\t--html\t\tSave text extract as an HTML file"
    print "\t-p\t--page\t\tSave text extract within a Readability(tm) HTML page"
    print "\t-v\t--view\t\tView HTML files in the browser after generation"
    print "\n"
if __name__ == '__main__':
    main(sys.argv[1:])