# derrickturk/treesoup.py

## treesoup.py
import sys
from typing import Callable, Dict, List

import requests
from bs4 import BeautifulSoup # type: ignore
from bs4.element import Tag, NavigableString # type: ignore
from bs4.element import Comment # type: ignore

def retrieve(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Fetch ``url`` over HTTP and parse the response body as HTML.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up; without
            a timeout ``requests.get`` can block forever.

    Returns:
        The response text parsed with the ``html.parser`` backend.

    Raises:
        ValueError: if the response body is empty.
    """
    response = requests.get(url, timeout=timeout)
    # Decode the body as UTF-8 regardless of the encoding requests guessed.
    response.encoding = 'utf-8'

    # Read .text once: each access decodes the whole body again.
    text = response.text
    if not text:
        raise ValueError('no text')

    return BeautifulSoup(text, 'html.parser')

def walk_tree(soup: BeautifulSoup) -> str:
    """Convert the children of ``soup`` to plain text.

    Each child tag is rendered by the handler registered under its tag name
    in ``walk_tree._DISPATCH_TABLE``. Tags without a handler are walked
    recursively, their text is stripped, and a warning goes to stderr.
    Text nodes are appended with surrounding whitespace stripped. HTML
    comments are skipped.

    Only the children are dispatched; ``soup`` itself never is.

    Args:
        soup: the node whose children are rendered.

    Returns:
        The concatenated text of all children.
    """
    parts: List[str] = []
    for child in soup.children:
        if isinstance(child, Tag):
            tag_name = child.name
            handler = walk_tree._DISPATCH_TABLE.get(tag_name)
            if handler is not None:
                parts.append(handler(child))
            else:
                sys.stderr.write(f'warning: using default folder for {tag_name}\n')
                parts.append(walk_tree(child).strip())
        elif isinstance(child, Comment):
            # Comment is a NavigableString subclass, so it has to be tested
            # first; otherwise the comment text would be emitted as content.
            continue
        elif isinstance(child, NavigableString):
            parts.append(child.strip())
        else:
            parts.append(f'hmmm - have {child} of type {type(child)}')
    return ''.join(parts)

def walk_div(div: Tag) -> str:
    """Render a <div>: its walked contents followed by a newline."""
    contents = walk_tree(div)
    return f'{contents}\n'

def walk_p(p: Tag) -> str:
    """Render a <p> as one tab-indented, newline-terminated line."""
    paragraph = walk_tree(p).strip()
    return f'\t{paragraph}\n'

def walk_a(a: Tag) -> str:
    """Render an <a>: its walked contents padded with one space on each side."""
    return ''.join((' ', walk_tree(a), ' '))

def walk_code(c: Tag) -> str:
    """Render a <code> element: its walked contents wrapped in backticks."""
    snippet = walk_tree(c)
    return f'`{snippet}`'

def walk_header(h: Tag) -> str:
    """Render a heading as ``= title =`` on a line of its own."""
    title = walk_tree(h).strip()
    return '= ' + title + ' =\n'

def walk_hr(_hr: Tag) -> str:
    """Render an <hr> as a line of 80 dashes; the tag itself is not inspected."""
    rule = '-' * 80
    return f'{rule}\n'

def walk_br(_br: Tag) -> str:
    """Render a <br> as a single newline; the tag itself is not inspected."""
    line_break = '\n'
    return line_break

def walk_quote(quote: Tag) -> str:
    """Render a quotation: its stripped contents inside double quotes."""
    quoted = walk_tree(quote).strip()
    return '"' + quoted + '"'

def walk_list(l: Tag) -> str:
    """Render a list element as one tab-indented line per child tag.

    Non-tag children, such as the whitespace between items, are skipped.
    ``<li>`` children are rendered through ``walk_li`` so each item gets its
    bullet prefix. Calling ``walk_tree`` on the item would walk only the
    item's children and never reach the ``li`` handler.

    Args:
        l: the ``<ol>`` or ``<ul>`` tag to render.

    Returns:
        The rendered items joined with newlines (no trailing newline).
    """
    # TODO: ul vs ol
    return '\n'.join(
        '\t' + (walk_li(item) if item.name == 'li' else walk_tree(item))
        for item in l.children
        if isinstance(item, Tag)
    )
def walk_li(li: Tag) -> str:
    """Render a list item: a ``- `` bullet followed by its walked contents."""
    return f'- {walk_tree(li)}'

def ignore(_: Tag) -> str:
    """Handler for tags that contribute nothing: always returns an empty string."""
    nothing = ''
    return nothing

# Tag name -> handler that renders that tag as text. walk_tree looks tags up
# here with .get(); names that are absent use its default handling.
walk_tree._DISPATCH_TABLE: Dict[str, Callable[[BeautifulSoup], str]] = {
    # Handlers shared by several tag names.
    **dict.fromkeys(('h1', 'h2', 'h3', 'h4', 'h5', 'h6'), walk_header),
    **dict.fromkeys(('script', 'comment', 'style', 'noscript'), ignore),
    **dict.fromkeys(('a', 'link'), walk_a),
    **dict.fromkeys(('ol', 'ul'), walk_list),
    # One handler per tag name.
    'div': walk_div,
    'p': walk_p,
    'hr': walk_hr,
    'br': walk_br,
    'blockquote': walk_quote,
    'code': walk_code,
    'span': walk_tree,
    'li': walk_li,
}

def main(argv: List[str]) -> int:
    """Command-line entry point: fetch the URL in ``argv[1]`` and print it as text.

    Args:
        argv: the full argument vector, program name first.

    Returns:
        0 on success, 2 when the arguments are not exactly one URL.
    """
    if len(argv) != 2:
        # argv can be empty when main() is called programmatically.
        prog = argv[0] if argv else 'treesoup.py'
        print(f'Usage: {prog} url', file=sys.stderr)
        # A usage error is a failure, so report it through the exit status.
        return 2

    soup = retrieve(argv[1])
    # soup.body is None for a document with no <body>; walk the whole parse
    # tree in that case instead of crashing inside walk_tree.
    root = soup.body if soup.body is not None else soup
    print(walk_tree(root))

    return 0

# Run as a script: pass the process arguments to main() and exit with its
# return value as the process status.
if __name__ == '__main__':
    raise SystemExit(main(sys.argv))
	import requests
	import sys
	from bs4 import BeautifulSoup # type: ignore
	from bs4.element import Tag, NavigableString # type: ignore

	from typing import Callable, Dict, List

	def retrieve(url: str, timeout: float = 30.0) -> BeautifulSoup:
	    """Fetch ``url`` over HTTP and parse the response body as HTML.

	    Args:
	        url: address of the page to download.
	        timeout: seconds to wait for the server before giving up; without
	            a timeout ``requests.get`` can block forever.

	    Returns:
	        The response text parsed with the ``html.parser`` backend.

	    Raises:
	        ValueError: if the response body is empty.
	    """
	    response = requests.get(url, timeout=timeout)
	    # Decode the body as UTF-8 regardless of the encoding requests guessed.
	    response.encoding = 'utf-8'

	    # Read .text once: each access decodes the whole body again.
	    text = response.text
	    if not text:
	        raise ValueError('no text')

	    return BeautifulSoup(text, 'html.parser')

	def walk_tree(soup: BeautifulSoup) -> str:
	    """Convert the children of ``soup`` to plain text.

	    Each child tag is rendered by the handler registered under its tag name
	    in ``walk_tree._DISPATCH_TABLE``. Tags without a handler are walked
	    recursively, their text is stripped, and a warning goes to stderr.
	    Text nodes are appended with surrounding whitespace stripped. HTML
	    comments are skipped.

	    Only the children are dispatched; ``soup`` itself never is.
	    """
	    parts: List[str] = []
	    for child in soup.children:
	        if isinstance(child, Tag):
	            tag_name = child.name
	            handler = walk_tree._DISPATCH_TABLE.get(tag_name)
	            if handler is not None:
	                parts.append(handler(child))
	            else:
	                sys.stderr.write(f'warning: using default folder for {tag_name}\n')
	                parts.append(walk_tree(child).strip())
	        elif isinstance(child, Comment):
	            # Comment is a NavigableString subclass, so it has to be tested
	            # first; otherwise the comment text would be emitted as content.
	            continue
	        elif isinstance(child, NavigableString):
	            parts.append(child.strip())
	        else:
	            parts.append(f'hmmm - have {child} of type {type(child)}')
	    return ''.join(parts)

	def walk_div(div: Tag) -> str:
	    """Render a <div>: its walked contents followed by a newline."""
	    return walk_tree(div) + '\n'

	def walk_p(p: Tag) -> str:
	    """Render a <p> as one tab-indented, newline-terminated line."""
	    return '\t' + walk_tree(p).strip() + '\n'

	def walk_a(a: Tag) -> str:
	    """Render an <a>: its walked contents padded with one space on each side."""
	    return ' ' + walk_tree(a) + ' '

	def walk_code(c: Tag) -> str:
	    """Render a <code> element: its walked contents wrapped in backticks."""
	    return '`' + walk_tree(c) + '`'

	def walk_header(h: Tag) -> str:
	    """Render a heading as ``= title =`` on a line of its own."""
	    return f'= {walk_tree(h).strip()} =\n'

	def walk_hr(_hr: Tag) -> str:
	    """Render an <hr> as a line of 80 dashes; the tag itself is not inspected."""
	    return '-' * 80 + '\n'

	def walk_br(_br: Tag) -> str:
	    """Render a <br> as a single newline; the tag itself is not inspected."""
	    return '\n'

	def walk_quote(quote: Tag) -> str:
	    """Render a quotation: its stripped contents inside double quotes."""
	    return f'"{walk_tree(quote).strip()}"'

	def walk_list(l: Tag) -> str:
	    """Render a list element as one tab-indented line per child tag.

	    Non-tag children, such as the whitespace between items, are skipped.
	    ``<li>`` children are rendered through ``walk_li`` so each item gets its
	    bullet prefix. Calling ``walk_tree`` on the item would walk only the
	    item's children and never reach the ``li`` handler.
	    """
	    # TODO: ul vs ol
	    return '\n'.join(
	        '\t' + (walk_li(item) if item.name == 'li' else walk_tree(item))
	        for item in l.children
	        if isinstance(item, Tag)
	    )

	def walk_li(li: Tag) -> str:
	    """Render a list item: a ``- `` bullet followed by its walked contents."""
	    return '- ' + walk_tree(li)

	def ignore(_: Tag) -> str:
	    """Handler for tags that contribute nothing: always returns an empty string."""
	    return ''

	# Tag name -> handler that renders that tag as text. walk_tree looks tags up
	# here with .get(); names that are absent use its default handling.
	walk_tree._DISPATCH_TABLE: Dict[str, Callable[[BeautifulSoup], str]] = {
	    # Handlers shared by several tag names.
	    **dict.fromkeys(('h1', 'h2', 'h3', 'h4', 'h5', 'h6'), walk_header),
	    **dict.fromkeys(('script', 'comment', 'style', 'noscript'), ignore),
	    **dict.fromkeys(('a', 'link'), walk_a),
	    **dict.fromkeys(('ol', 'ul'), walk_list),
	    # One handler per tag name.
	    'div': walk_div,
	    'p': walk_p,
	    'hr': walk_hr,
	    'br': walk_br,
	    'blockquote': walk_quote,
	    'code': walk_code,
	    'span': walk_tree,
	    'li': walk_li,
	}

	def main(argv: List[str]) -> int:
	    """Command-line entry point: fetch the URL in ``argv[1]`` and print it as text.

	    Args:
	        argv: the full argument vector, program name first.

	    Returns:
	        0 on success, 2 when the arguments are not exactly one URL.
	    """
	    if len(argv) != 2:
	        # argv can be empty when main() is called programmatically.
	        prog = argv[0] if argv else 'treesoup.py'
	        print(f'Usage: {prog} url', file=sys.stderr)
	        # A usage error is a failure, so report it through the exit status.
	        return 2

	    soup = retrieve(argv[1])
	    # soup.body is None for a document with no <body>; walk the whole parse
	    # tree in that case instead of crashing inside walk_tree.
	    root = soup.body if soup.body is not None else soup
	    print(walk_tree(root))

	    return 0

	# Run as a script: pass the process arguments to main() and exit with its
	# return value as the process status.
	if __name__ == '__main__':
	    sys.exit(main(sys.argv))