Skip to content

Instantly share code, notes, and snippets.

@andreypopp
Created May 28, 2012 17:38
Show Gist options
  • Save andreypopp/2820220 to your computer and use it in GitHub Desktop.
Save andreypopp/2820220 to your computer and use it in GitHub Desktop.
text-to-tags ratio based content extractor
#!/usr/bin/env python
"""usage: bte file
arguments:
file filename or - to read from stdin
options:
-h, --help show this message and exit
Body text extraction implementation. Originally based on the idea of finding the
continuous block of tags with best text to tag ratio. See
http://www.aidanf.net/posts/bte_gets_an_update.html for the idea.
"""
from __future__ import division
from sys import stdin
from docopt import docopt
from html5lib import HTMLParser, treewalkers, treebuilders
def _tokenize(doc):
    """Flatten a parsed DOM document into parallel token and flag lists.

    Walks ``doc`` with the html5lib "dom" tree walker and returns
    ``(tokens, bintokens)`` where ``bintokens[k]`` is 1 when ``tokens[k]``
    is a tag or comment and 0 when it is a single word of text.

    Everything that follows the start tag of a ``link``, ``script`` or
    ``style`` element, up to and including its matching end tag, is
    dropped; the start tag itself is still recorded as a tag token.

    Raises:
        ValueError: if the walker yields a token type not handled here.
    """
    walker = treewalkers.getTreeWalker("dom")
    tokens = []
    bintokens = []
    waitfor = None
    for tok in walker(doc):
        if waitfor:
            # Inside a skipped element: discard tokens until (and
            # including) the end tag that closes it.
            if tok["type"] == waitfor[0] and tok["name"] == waitfor[1]:
                waitfor = None
            continue
        kind = tok["type"]
        if kind == "StartTag" and tok["name"] in ("link", "script", "style"):
            waitfor = ("EndTag", tok["name"])
        if kind in ("EndTag", "StartTag", "EmptyTag", "Comment"):
            bintokens.append(1)
            tokens.append(tok)
        elif kind == "Characters":
            # One token per whitespace-separated word, so the text count
            # is measured in words rather than in text nodes.
            for word in tok["data"].split():
                bintokens.append(0)
                tokens.append({"type": "Characters", "data": word})
        elif kind in ("SpaceCharacters", "Doctype"):
            pass
        else:
            raise ValueError("unrecognizable token type: %r" % tok)
    return tokens, bintokens


def _best_span(bintokens):
    """Return the index pair ``(i, j)``, ``i < j``, with the highest score.

    With ``cum`` the running sum of ``bintokens``, the score of a pair is::

        cum[i] + (cum[-1] - cum[j]) + ((j - i) - (cum[j] - cum[i]))

    i.e. tags up to ``i``, plus tags after ``j``, plus text tokens in
    ``(i, j]``.  Ties are resolved in favour of the smallest ``i`` and then
    the smallest ``j``, which is the pair an exhaustive scan in
    lexicographic order with a strict ``>`` comparison would keep.

    Returns ``None`` when there are fewer than two tokens, since no pair
    exists in that case.
    """
    length = len(bintokens)
    if length < 2:
        return None

    cum = []
    running = 0
    for flag in bintokens:
        running += flag
        cum.append(running)

    # The score rearranges to cum[-1] + (2*cum[i] - i) + (j - 2*cum[j]):
    # a constant, a term depending only on i and a term depending only on
    # j.  That lets the best j for every i be precomputed in one pass
    # instead of testing every pair.
    end_gain = [j - 2 * total for j, total in enumerate(cum)]

    # best_end[i] is the smallest j > i with the largest end_gain[j].
    best_end = [None] * length
    best = length - 1
    for i in range(length - 2, -1, -1):
        # ">=" so that, on equal gain, the smaller index wins.
        if end_gain[i + 1] >= end_gain[best]:
            best = i + 1
        best_end[i] = best

    span = None
    top = None
    for i in range(length - 1):
        j = best_end[i]
        score = (2 * cum[i] - i) + end_gain[j]
        # Strict ">" keeps the earliest i among equal scores.
        if span is None or score > top:
            span = (i, j)
            top = score
    return span


def main(opts, args):
    """Print the words of the best-scoring token span of an HTML document.

    Args:
        opts: parsed command-line options; not used by this function.
        args: object whose ``file`` attribute is a path to the HTML input,
            or ``"-"`` to read the document from standard input.

    Returns:
        None.  The selected words are written to standard output, joined
        by single spaces.  A document that yields fewer than two tokens has
        no span to choose from, so all of its words (possibly none) are
        printed.

    Raises:
        ValueError: if the tree walker yields an unrecognised token type.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    if args.file == "-":
        doc = parser.parse(stdin)
    else:
        # Close the file as soon as the document has been parsed.
        with open(args.file, "r") as fd:
            doc = parser.parse(fd)

    tokens, bintokens = _tokenize(doc)
    span = _best_span(bintokens)
    if span is None:
        selected = tokens
    else:
        start, end = span
        selected = tokens[start:end + 1]
    # Parenthesised single argument prints identically on Python 2 and 3.
    print(serialize_tokens(selected))
def serialize_tokens(tokens):
    """Join the text of every "Characters" token with single spaces.

    Tokens of any other type are ignored.  Returns an empty string when
    ``tokens`` contains no "Characters" token.
    """
    words = []
    for token in tokens:
        if token["type"] != "Characters":
            continue
        words.append(token.get("data"))
    return " ".join(words)
if __name__ == "__main__":
    # Use sys.exit rather than the `exit` builtin: the latter is installed
    # by the `site` module for interactive use and is missing when the
    # interpreter is started with -S.
    import sys

    # NOTE(review): unpacking two values presumably relies on a docopt
    # release whose docopt() returns an (options, arguments) pair -- confirm
    # against the installed version.
    opts, args = docopt(__doc__)
    sys.exit(main(opts, args))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment