andreypopp/gist:2820220

## gistfile1.py
#!/usr/bin/env python
"""usage: bte file

arguments:

    file            filename or - to read from stdin

options:

    -h, --help      show this message and exit

Body text extraction implementation. Originally based on the idea of finding the
continuous block of tags with the best text-to-tag ratio. See
http://www.aidanf.net/posts/bte_gets_an_update.html for details.
"""

from __future__ import division
from sys import stdin
from docopt import docopt
from html5lib import HTMLParser, treewalkers, treebuilders

def main(opts, args):
    """Print the main body text of an HTML document to stdout.

    The document is parsed with html5lib and flattened into a token list in
    which every tag or comment is one "tag" token and every
    whitespace-separated word of character data is one "text" token.  The
    best-scoring span of that list (see _best_span) is selected and the
    words inside it are printed, joined by single spaces.

    opts -- parsed command line options (not used here).
    args -- parsed command line arguments; args.file is a path, or "-" to
            read the document from stdin.

    Raises ValueError if the tree walker yields a token type that is not
    handled below.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    if args.file == "-":
        doc = parser.parse(stdin)
    else:
        # Close the file as soon as it has been parsed instead of leaking it.
        with open(args.file, "r") as fd:
            doc = parser.parse(fd)

    walker = treewalkers.getTreeWalker("dom")

    skipped_elements = ("link", "script", "style")
    tag_types = ("EndTag", "StartTag", "EmptyTag", "Comment")

    tokens = []
    bintokens = []  # parallel to tokens: 1 for a tag/comment, 0 for a word

    # (type, name) of the end tag that closes the element being skipped.
    waitfor = None

    for tok in walker(doc):
        kind = tok["type"]

        if waitfor:
            # Drop everything up to and including the matching end tag.
            if kind == waitfor[0] and tok["name"] == waitfor[1]:
                waitfor = None
            continue

        if kind == "StartTag" and tok["name"] in skipped_elements:
            # The start tag itself is still recorded below; only the
            # element's content and its end tag are dropped.
            waitfor = ("EndTag", tok["name"])

        if kind in tag_types:
            bintokens.append(1)
            tokens.append(tok)

        elif kind == "Characters":
            for word in tok["data"].split():
                bintokens.append(0)
                tokens.append({"type": "Characters", "data": word})

        elif kind in ("SpaceCharacters", "Doctype"):
            pass

        else:
            raise ValueError("unrecognizable token type: %r" % tok)

    span = _best_span(bintokens)
    if span is None:
        # Fewer than two tokens: there is no (start, end) pair to choose.
        return

    i, j = span
    # Parenthesised single argument prints the same on Python 2 and 3.
    print(serialize_tokens(tokens[i:j + 1]))


def _best_span(bintokens):
    """Return the best-scoring (i, j) index pair with i < j, or None.

    bintokens is a list of 1 (tag) / 0 (word) flags.  The score of a pair is

        tags in [0, i] + words in (i, j] + tags in (j, end]

    With cum[k] the number of tags in [0, k] this equals

        cum[-1] + (2 * cum[i] - i) + (j - 2 * cum[j])

    so a running maximum of the j-term over each suffix gives the best
    partner for every i in linear time, instead of trying every pair.
    Ties go to the smallest i, then the smallest j, which is the pair an
    exhaustive scan in increasing (i, j) order would keep.  None is
    returned when there are fewer than two tokens.
    """
    length = len(bintokens)
    if length < 2:
        return None

    # Inclusive prefix sums of the tag flags.
    cum = []
    running = 0
    for flag in bintokens:
        running += flag
        cum.append(running)

    # best_right[i] = (value, j) maximising j - 2 * cum[j] over all j > i.
    best_right = [None] * length
    best = None
    for j in range(length - 1, 0, -1):
        value = j - 2 * cum[j]
        # ">=" so that, scanning right to left, the smaller j wins a tie.
        if best is None or value >= best[0]:
            best = (value, j)
        best_right[j - 1] = best

    span = None
    top = None
    for i in range(length - 1):
        value, j = best_right[i]
        # cum[-1] is the same for every pair, so it is left out.
        score = 2 * cum[i] - i + value
        # Strict ">" keeps the first (smallest) i among equal scores.
        if top is None or score > top:
            top = score
            span = (i, j)

    return span

def serialize_tokens(tokens):
    """Join the text of every "Characters" token with single spaces.

    tokens is a sequence of token dicts; tokens of any other type are
    skipped.
    """
    words = []
    for token in tokens:
        if token["type"] != "Characters":
            continue
        words.append(token.get("data"))
    return " ".join(words)

if __name__ == "__main__":
    # The module docstring doubles as the usage text handed to docopt.
    opts, args = docopt(__doc__)
    status = main(opts, args)
    exit(status)
	#!/usr/bin/env python
	"""usage: bte file

	arguments:

	file            filename or - to read from stdin

	options:

	-h, --help      show this message and exit

	Body text extraction implementation. Originally based on the idea of finding the
	continuous block of tags with the best text-to-tag ratio. See
	http://www.aidanf.net/posts/bte_gets_an_update.html for details.
	"""

	from __future__ import division
	from sys import stdin
	from docopt import docopt
	from html5lib import HTMLParser, treewalkers, treebuilders

	def main(opts, args):
	    """Print the main body text of an HTML document to stdout.

	    The document is parsed with html5lib and flattened into a token list in
	    which every tag or comment is one "tag" token and every
	    whitespace-separated word of character data is one "text" token.  The
	    best-scoring span of that list (see _best_span) is selected and the
	    words inside it are printed, joined by single spaces.

	    opts -- parsed command line options (not used here).
	    args -- parsed command line arguments; args.file is a path, or "-" to
	            read the document from stdin.

	    Raises ValueError if the tree walker yields a token type that is not
	    handled below.
	    """
	    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

	    if args.file == "-":
	        doc = parser.parse(stdin)
	    else:
	        # Close the file as soon as it has been parsed instead of leaking it.
	        with open(args.file, "r") as fd:
	            doc = parser.parse(fd)

	    walker = treewalkers.getTreeWalker("dom")

	    skipped_elements = ("link", "script", "style")
	    tag_types = ("EndTag", "StartTag", "EmptyTag", "Comment")

	    tokens = []
	    bintokens = []  # parallel to tokens: 1 for a tag/comment, 0 for a word

	    # (type, name) of the end tag that closes the element being skipped.
	    waitfor = None

	    for tok in walker(doc):
	        kind = tok["type"]

	        if waitfor:
	            # Drop everything up to and including the matching end tag.
	            if kind == waitfor[0] and tok["name"] == waitfor[1]:
	                waitfor = None
	            continue

	        if kind == "StartTag" and tok["name"] in skipped_elements:
	            # The start tag itself is still recorded below; only the
	            # element's content and its end tag are dropped.
	            waitfor = ("EndTag", tok["name"])

	        if kind in tag_types:
	            bintokens.append(1)
	            tokens.append(tok)

	        elif kind == "Characters":
	            for word in tok["data"].split():
	                bintokens.append(0)
	                tokens.append({"type": "Characters", "data": word})

	        elif kind in ("SpaceCharacters", "Doctype"):
	            pass

	        else:
	            raise ValueError("unrecognizable token type: %r" % tok)

	    span = _best_span(bintokens)
	    if span is None:
	        # Fewer than two tokens: there is no (start, end) pair to choose.
	        return

	    i, j = span
	    # Parenthesised single argument prints the same on Python 2 and 3.
	    print(serialize_tokens(tokens[i:j + 1]))


	def _best_span(bintokens):
	    """Return the best-scoring (i, j) index pair with i < j, or None.

	    bintokens is a list of 1 (tag) / 0 (word) flags.  The score of a pair is

	        tags in [0, i] + words in (i, j] + tags in (j, end]

	    With cum[k] the number of tags in [0, k] this equals

	        cum[-1] + (2 * cum[i] - i) + (j - 2 * cum[j])

	    so a running maximum of the j-term over each suffix gives the best
	    partner for every i in linear time, instead of trying every pair.
	    Ties go to the smallest i, then the smallest j, which is the pair an
	    exhaustive scan in increasing (i, j) order would keep.  None is
	    returned when there are fewer than two tokens.
	    """
	    length = len(bintokens)
	    if length < 2:
	        return None

	    # Inclusive prefix sums of the tag flags.
	    cum = []
	    running = 0
	    for flag in bintokens:
	        running += flag
	        cum.append(running)

	    # best_right[i] = (value, j) maximising j - 2 * cum[j] over all j > i.
	    best_right = [None] * length
	    best = None
	    for j in range(length - 1, 0, -1):
	        value = j - 2 * cum[j]
	        # ">=" so that, scanning right to left, the smaller j wins a tie.
	        if best is None or value >= best[0]:
	            best = (value, j)
	        best_right[j - 1] = best

	    span = None
	    top = None
	    for i in range(length - 1):
	        value, j = best_right[i]
	        # cum[-1] is the same for every pair, so it is left out.
	        score = 2 * cum[i] - i + value
	        # Strict ">" keeps the first (smallest) i among equal scores.
	        if top is None or score > top:
	            top = score
	            span = (i, j)

	    return span

	def serialize_tokens(tokens):
	    """Join the text of every "Characters" token with single spaces.

	    tokens is a sequence of token dicts; tokens of any other type are
	    skipped.
	    """
	    words = []
	    for token in tokens:
	        if token["type"] != "Characters":
	            continue
	        words.append(token.get("data"))
	    return " ".join(words)

	if __name__ == "__main__":
	    # The module docstring doubles as the usage text handed to docopt.
	    opts, args = docopt(__doc__)
	    exit(main(opts, args))