Last active
November 19, 2020 22:03
-
-
Save derrickturk/1aebc01f5d609087d7d5f4ee3a28e46e to your computer and use it in GitHub Desktop.
walking the tree of soup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
from typing import Callable, Dict, List

import requests
from bs4 import BeautifulSoup  # type: ignore
from bs4.element import Comment, NavigableString, Tag  # type: ignore
def retrieve(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download *url* and parse the response body as HTML.

    The body is always decoded as UTF-8, overriding whatever encoding
    ``requests`` picked for the response, and is parsed with the
    ``html.parser`` backend.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        The parsed document.

    Raises:
        ValueError: If the response body is empty.
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    # Force the decoding used by ``response.text`` below.
    response.encoding = 'utf-8'
    text = response.text
    if not text:
        raise ValueError('no text')
    return BeautifulSoup(text, 'html.parser')
def walk_tree(soup: Tag) -> str:
    """Render the children of *soup* as plain text.

    Each child tag is rendered by the handler registered under its name in
    ``walk_tree._DISPATCH_TABLE``. A tag with no registered handler is walked
    recursively, its result stripped, and a warning is written to stderr.
    Text nodes are emitted with surrounding whitespace stripped. HTML
    comments are skipped.

    Args:
        soup: The element whose children are rendered. This may be a whole
            parsed document or any tag inside it (the function calls itself
            with child tags).

    Returns:
        The rendered pieces concatenated in document order.
    """
    pieces: List[str] = []
    for child in soup.children:
        if isinstance(child, Tag):
            handler = walk_tree._DISPATCH_TABLE.get(child.name)
            if handler is None:
                sys.stderr.write(
                    f'warning: using default folder for {child.name}\n')
                pieces.append(walk_tree(child).strip())
            else:
                pieces.append(handler(child))
        elif isinstance(child, Comment):
            # Comment is a NavigableString subclass, so it has to be tested
            # before the generic text branch or its body leaks into output.
            continue
        elif isinstance(child, NavigableString):
            pieces.append(child.strip())
        else:
            pieces.append(f'hmmm - have {child} of type {type(child)}')
    return ''.join(pieces)
def walk_div(div: Tag) -> str:
    """Render a ``<div>``: its walked contents followed by a newline."""
    contents = walk_tree(div)
    return f'{contents}\n'
def walk_p(p: Tag) -> str:
    """Render a ``<p>`` as one tab-indented line.

    The walked contents are stripped of surrounding whitespace, prefixed
    with a tab and terminated with a newline.
    """
    paragraph = walk_tree(p).strip()
    return f'\t{paragraph}\n'
def walk_a(a: Tag) -> str:
    """Render a link: its walked contents padded with one space per side."""
    link_text = walk_tree(a)
    return f' {link_text} '
def walk_code(c: Tag) -> str:
    """Render a ``<code>`` element: its walked contents inside backticks."""
    snippet = walk_tree(c)
    return f'`{snippet}`'
def walk_header(h: Tag) -> str:
    """Render a heading as ``= title =`` on a line of its own.

    The walked contents are stripped of surrounding whitespace before being
    wrapped; the result ends with a newline.
    """
    title = walk_tree(h).strip()
    return '= ' + title + ' =\n'
def walk_hr(_hr: Tag) -> str:
    """Render an ``<hr>`` as a line of 80 dashes; the tag is not inspected."""
    rule = '-' * 80
    return f'{rule}\n'
def walk_br(_br: Tag) -> str:
    """Render a ``<br>`` as a single newline; the tag is not inspected."""
    line_break = '\n'
    return line_break
def walk_quote(quote: Tag) -> str:
    """Render a quotation: its stripped, walked contents in double quotes."""
    quoted = walk_tree(quote).strip()
    return '"' + quoted + '"'
def walk_list(l: Tag) -> str:
    """Render a list element as tab-indented lines, one per child tag.

    Only child tags are rendered; text nodes directly inside the list are
    dropped. Lines are joined with newlines and the result has no trailing
    newline.

    Args:
        l: The list element whose child tags are rendered.

    Returns:
        One ``'\\t'``-prefixed line per child tag, joined with ``'\\n'``.
    """
    # TODO: ul vs ol
    # NOTE(review): each child tag is handed to walk_tree, which renders the
    # child's contents rather than dispatching on the child itself, so a
    # handler registered for the child's own tag name is not applied here.
    # Confirm that is intended.
    return '\n'.join(
        '\t' + walk_tree(item) for item in l.children if isinstance(item, Tag))
def walk_li(li: Tag) -> str:
    """Render a list item: ``'- '`` followed by its walked contents."""
    item_text = walk_tree(li)
    return f'- {item_text}'
def ignore(_: Tag) -> str:
    """Render a tag as nothing: always returns the empty string."""
    nothing = str()
    return nothing
# Handler lookup read by walk_tree: maps a tag name to the function that
# renders a tag of that name. Tag names missing from this table are walked
# recursively by walk_tree, which also writes a warning to stderr.
walk_tree._DISPATCH_TABLE: Dict[str, Callable[[BeautifulSoup], str]] = {
    # Containers and text blocks.
    'div': walk_div,
    'p': walk_p,
    'span': walk_tree,
    'blockquote': walk_quote,
    'code': walk_code,
    # Headings h1 through h6 share one renderer.
    **{f'h{level}': walk_header for level in range(1, 7)},
    # Separators.
    'hr': walk_hr,
    'br': walk_br,
    # Links.
    'a': walk_a,
    'link': walk_a,
    # Lists.
    'ol': walk_list,
    'ul': walk_list,
    'li': walk_li,
    # Tags rendered as the empty string.
    'script': ignore,
    'style': ignore,
    'noscript': ignore,
    'comment': ignore,
}
def main(argv: List[str]) -> int:
    """Fetch the page named on the command line and print it as plain text.

    Args:
        argv: Full argument vector: ``argv[0]`` is the program name and
            ``argv[1]`` is the URL to fetch.

    Returns:
        The process exit status: 0 on success, 2 when the wrong number of
        arguments was given.
    """
    if len(argv) != 2:
        print(f'Usage: {argv[0]} url', file=sys.stderr)
        # A usage error must not report success to the calling shell.
        return 2
    soup = retrieve(argv[1])
    # soup.body is None when the document has no <body> element; walk the
    # whole document in that case instead of failing on None.
    root = soup if soup.body is None else soup.body
    print(walk_tree(root))
    return 0
# Run main() only when executed as a script; its return value becomes the
# process exit status.
if __name__ == '__main__':
    raise SystemExit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment