Skip to content

Instantly share code, notes, and snippets.

@kwellman
Created October 18, 2010 15:44
Show Gist options
  • Save kwellman/632442 to your computer and use it in GitHub Desktop.
Save kwellman/632442 to your computer and use it in GitHub Desktop.
lxml_readability.py
"""An lxml Port of Nirmal Patel's port (http://nirmalpatel.com/fcgi/hn.py) of
Arc90's Readability to Python.
"""
import re
from lxml.html import fromstring, tostring
from lxml.html.clean import Cleaner
# Words that, when found by extract() in the class or id attribute of a
# paragraph's parent element, subtract points from that parent.
NEGATIVE = re.compile('comment|meta|footer|footnote|foot')
# Words that, when found by extract() in the class or id attribute of a
# paragraph's parent element, add points to that parent.
POSITIVE = re.compile('post|hentry|entry|content|text|body|article')
# Two consecutive <br> tags (each optionally self-closing, with optional
# whitespace between them); extract() replaces each such pair with a
# paragraph boundary.
BREAKS = re.compile(r'<br */? *>\s*<br */? *>')
def _attribute_score(element):
    """Return the score adjustment implied by an element's class and id.

    Each of the two attributes is checked independently: a NEGATIVE hit
    subtracts 50 points and a POSITIVE hit adds 25 points.
    """
    score = 0
    for name in ('class', 'id'):
        # Element.get() returns None for a missing attribute; ``name in
        # element`` would test for a child element instead.
        value = element.get(name)
        if not value:
            continue
        # search() rather than match(): the hint may appear anywhere in the
        # value, e.g. "main-content" or "blog post".
        if NEGATIVE.search(value):
            score -= 50
        if POSITIVE.search(value):
            score += 25
    return score


def extract(link, html):
    """Return the text of the container that looks most like the article body.

    Pairs of consecutive ``<br>`` tags are first turned into paragraph
    boundaries and the document is run through lxml's ``Cleaner``.  Every
    ``<p>`` element then contributes points to its direct parent: one point
    if the paragraph holds more than 10 characters of text, plus one point
    per comma.  A parent also starts with a bonus or penalty derived from
    its ``class`` and ``id`` attributes.

    Args:
        link: Base URL of the page.  When truthy, links inside the winning
            container are made absolute against it before the text is read.
        html: HTML source of the page as a string.

    Returns:
        The text content of the highest-scoring parent element (the earliest
        one wins a tie), or ``None`` when no paragraph with a parent element
        was found.
    """
    # convert text separated by breaks into individual paragraph elements
    html = BREAKS.sub('</p><p>', html)
    doc = fromstring(html)
    # remove style, meta, page structure, script, form elements
    doc = Cleaner(style=True).clean_html(doc)

    candidates = []  # parents in first-seen order, so ties go to the earliest
    scores = {}      # parent element -> accumulated score

    # traverse paragraph elements
    for paragraph in doc.iter('p'):
        parent = paragraph.getparent()
        if parent is None:
            # The paragraph is the document root: there is no container
            # to score.
            continue
        if parent not in scores:
            candidates.append(parent)
            scores[parent] = _attribute_score(parent)

        text = paragraph.text_content()
        # add points for containing a significant amount of text
        if len(text) > 10:
            scores[parent] += 1
        # add points for containing punctuation common in text content
        scores[parent] += text.count(',')

    if not candidates:
        return None

    # find highest scoring element; max() keeps the first of equal scores
    best = max(candidates, key=scores.get)
    if link:
        best.make_links_absolute(link)
    return best.text_content()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment