Created
May 28, 2012 17:38
-
-
Save andreypopp/2820220 to your computer and use it in GitHub Desktop.
text-to-tags ratio based content extractor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""usage: bte file | |
arguments: | |
file filename or - to read from stdin | |
options: | |
-h, --help show this message and exit | |
Body text extraction implementation. Originally based on the idea of finding the | |
continuous block of tags with best text to tag ratio. See
http://www.aidanf.net/posts/bte_gets_an_update.html for the idea. | |
""" | |
from __future__ import division | |
from sys import stdin | |
from docopt import docopt | |
from html5lib import HTMLParser, treewalkers, treebuilders | |
def _collect_tokens(stream):
    """Flatten a tree-walker token stream into parallel token/flag lists.

    Every tag or comment token is kept as-is and flagged ``1``; every
    "Characters" token is split on whitespace into one synthetic
    "Characters" token per word, each flagged ``0``.  "SpaceCharacters" and
    "Doctype" tokens are dropped.  After a "StartTag" named ``link``,
    ``script`` or ``style`` everything up to and including the matching
    "EndTag" is skipped (the start tag itself is still recorded).

    Returns:
        A ``(tokens, bintokens)`` pair of equally long lists.

    Raises:
        ValueError: if a token of any other type is encountered.
    """
    tokens = []
    bintokens = []
    waitfor = None
    for tok in stream:
        kind = tok["type"]
        if waitfor:
            # Inside a skipped element: swallow tokens until its end tag.
            if kind == waitfor[0] and tok["name"] == waitfor[1]:
                waitfor = None
            continue
        if kind == "StartTag" and tok["name"] in ("link", "script", "style"):
            waitfor = ("EndTag", tok["name"])
        if kind in ("EndTag", "StartTag", "EmptyTag", "Comment"):
            bintokens.append(1)
            tokens.append(tok)
        elif kind == "Characters":
            # One token per word, so text volume is measured in words.
            for word in tok["data"].split():
                bintokens.append(0)
                tokens.append({"type": "Characters", "data": word})
        elif kind in ("SpaceCharacters", "Doctype"):
            pass
        else:
            raise ValueError("unrecognizable token type: %r" % tok)
    return tokens, bintokens


def _best_span(bintokens):
    """Return the inclusive ``(i, j)`` span, ``i < j``, with the best score.

    With ``c[k]`` the number of tags in ``bintokens[:k + 1]``, the score of
    a span is::

        c[i] + (c[-1] - c[j]) + ((j - i) - (c[j] - c[i]))

    i.e. tags up to ``i`` plus tags after ``j`` plus text tokens in
    ``(i, j]``.  That equals ``c[-1] + (2*c[i] - i) + (j - 2*c[j])``, so the
    ``i`` and ``j`` parts can be maximised independently in a single pass
    instead of comparing every pair.  Ties resolve to the lexicographically
    smallest ``(i, j)``, exactly as an exhaustive scan with a strict ``>``
    comparison would.

    Returns ``None`` when there are fewer than two tokens (no valid pair).
    """
    count = len(bintokens)
    if count < 2:
        return None
    tags_seen = bintokens[0]          # c[0]
    left_idx = 0                      # earliest argmax of 2*c[i] - i so far
    left_score = 2 * tags_seen
    best_pair = None
    best_score = None
    for j in range(1, count):
        tags_seen += bintokens[j]     # c[j]
        # The constant c[-1] is omitted: it does not affect the argmax.
        score = left_score + j - 2 * tags_seen
        if best_score is None or score > best_score:
            best_score = score
            best_pair = (left_idx, j)
        # Only now may j serve as a left edge, for later right edges.
        candidate = 2 * tags_seen - j
        if candidate > left_score:
            left_score = candidate
            left_idx = j
    return best_pair


def main(opts, args):
    """Parse an HTML document and print its extracted body text.

    Args:
        opts: parsed command-line options (unused here).
        args: parsed command-line arguments; ``args.file`` is a filename, or
            ``"-"`` to read the document from stdin.

    The document is parsed with html5lib into a DOM tree, walked into a flat
    token list, and the token span with the best tags-outside/text-inside
    score is printed as space-separated words.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    if args.file == "-":
        doc = parser.parse(stdin)
    else:
        # Close the input file as soon as parsing is done.
        with open(args.file, "r") as fd:
            doc = parser.parse(fd)
    walker = treewalkers.getTreeWalker("dom")
    tokens, bintokens = _collect_tokens(walker(doc))

    span = _best_span(bintokens)
    if span is None:
        # Fewer than two tokens: nothing to choose between, emit them all.
        selected = tokens
    else:
        start, stop = span
        selected = tokens[start:stop + 1]
    # Parenthesised single-argument form behaves the same on Python 2 and 3.
    print(serialize_tokens(selected))
def serialize_tokens(tokens):
    """Return the text of all "Characters" tokens joined by single spaces.

    Tokens whose ``"type"`` is anything other than ``"Characters"`` are
    ignored; the remaining tokens contribute their ``"data"`` value in
    order.
    """
    words = []
    for token in tokens:
        if token["type"] != "Characters":
            continue
        words.append(token.get("data"))
    return " ".join(words)
if __name__ == "__main__":
    # Script entry point: docopt parses the command line against the module
    # docstring and its result is unpacked into (opts, args) for main().
    opts, args = docopt(__doc__)
    # Raise SystemExit directly instead of calling exit(): that helper is
    # injected by the ``site`` module for interactive use and is missing
    # when the interpreter runs with -S.  A None result still exits with 0.
    raise SystemExit(main(opts, args))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment