Skip to content

Instantly share code, notes, and snippets.

Created May 28, 2012 17:38
Show Gist options
  • Save andreypopp/2820220 to your computer and use it in GitHub Desktop.
Save andreypopp/2820220 to your computer and use it in GitHub Desktop.
text-to-tags ratio based content extractor
#!/usr/bin/env python
"""usage: bte file
file filename or - to read from stdin
-h, --help show this message and exit
Body text extraction implementation. Originally based on the idea of finding the
continuous block of tags with the best text-to-tag ratio. See for the idea.
"""
from __future__ import division
from sys import stdin
from docopt import docopt
from html5lib import HTMLParser, treewalkers, treebuilders
def _find_best_span(tag_flags):
    """Return the (i, j) index pair with the highest body-text score.

    ``tag_flags`` holds one entry per token: 1 for a markup token, 0 for a
    word of text.  For every pair i < j the score is the number of tags at
    or before i, plus the number of tags after j, plus the number of text
    tokens in (i, j].  The first pair reaching the maximum wins.

    Returns None when there are fewer than two tokens (no pair exists).
    """
    # Running totals of tags seen so far, so any range count is a subtraction.
    cumulative = []
    running = 0
    for flag in tag_flags:
        running += flag
        cumulative.append(running)

    length = len(cumulative)
    best_span = None
    best_score = None
    for i in range(length):
        for j in range(i + 1, length):
            tags_after = cumulative[-1] - cumulative[j]
            tags_before = cumulative[i]
            text_between = (j - i) - (cumulative[j] - cumulative[i])
            score = tags_after + tags_before + text_between
            if best_span is None or score > best_score:
                best_span = i, j
                best_score = score
    return best_span


def main(opts, args):
    """Parse an HTML document and print its extracted body text.

    ``args.file`` is a filename, or "-" to read from stdin.  ``opts`` is
    accepted for interface compatibility and is not used.

    The document is walked token by token; markup tokens are flagged 1 and
    each whitespace-separated word of text is flagged 0.  The span chosen by
    ``_find_best_span`` is printed as space-joined words.  Returns None.

    Raises ValueError on a token type this function does not recognize.

    NOTE(review): the copy of this function under review had lost its
    indentation and the bodies of several branches (the ``else`` for opening
    the file, the ``continue`` while skipping, the appends to ``tokens`` /
    ``bintokens``, and the final ``else`` before the ``raise``).  They were
    restored from the requirement that ``tokens`` and ``bintokens`` stay
    index-aligned -- confirm against the upstream gist.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    if args.file == "-":
        doc = parser.parse(stdin)
    else:
        # Close the file as soon as parsing is done instead of leaking it.
        with open(args.file, "r") as fd:
            doc = parser.parse(fd)
    walker = treewalkers.getTreeWalker("dom")

    tokens = []
    bintokens = []  # parallel to tokens: 1 = tag/comment, 0 = text word
    waitfor = None  # (type, name) of the end tag that ends a skipped element
    for tok in walker(doc):
        if waitfor:
            # Inside <script>/<style>/...: drop everything up to the end tag.
            if tok["type"] == waitfor[0] and tok["name"] == waitfor[1]:
                waitfor = None
            continue
        if tok["type"] == "StartTag" and tok["name"] in ("link", "script", "style"):
            waitfor = ("EndTag", tok["name"])
        if tok["type"] in ("EndTag", "StartTag", "EmptyTag", "Comment"):
            bintokens.append(1)
            tokens.append(tok)
        elif tok["type"] in ("Characters",):
            # One token per word so that text length counts toward the score.
            for word in tok["data"].split():
                bintokens.append(0)
                tokens.append({"type": "Characters", "data": word})
        elif tok["type"] in ("SpaceCharacters", "Doctype"):
            continue
        else:
            raise ValueError("unrecognizable token type: %r" % tok)

    span = _find_best_span(bintokens)
    if span is None:
        # Zero or one token: no pair to score, so emit whatever text exists
        # rather than crashing on an empty or trivial document.
        selected = tokens
    else:
        i, j = span
        selected = tokens[i:j + 1]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(serialize_tokens(selected))
def serialize_tokens(tokens):
    """Join the text of all "Characters" tokens with single spaces.

    ``tokens`` is an iterable of dicts that each carry a "type" key; only
    those whose type is "Characters" contribute, via their "data" value.
    Tokens of any other type are ignored.  Returns "" when nothing matches.
    """
    words = (tok.get("data") for tok in tokens if tok["type"] == "Characters")
    return " ".join(words)
if __name__ == "__main__":
    # Imported here because the file only imports ``stdin`` from sys; the
    # bare ``exit`` name is injected by the ``site`` module and is not
    # available in every interpreter configuration.
    import sys

    # The module docstring doubles as the command-line usage text.
    # NOTE(review): this unpacks docopt's result as an (options, arguments)
    # pair and main() reads ``args.file``; newer docopt releases return a
    # single dict -- confirm the docopt version this script is pinned to.
    opts, args = docopt(__doc__)
    sys.exit(main(opts, args))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment