Skip to content

Instantly share code, notes, and snippets.

@derrickturk
Last active November 19, 2020 22:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save derrickturk/1aebc01f5d609087d7d5f4ee3a28e46e to your computer and use it in GitHub Desktop.
Save derrickturk/1aebc01f5d609087d7d5f4ee3a28e46e to your computer and use it in GitHub Desktop.
walking the tree of soup
import sys
from typing import Callable, Dict, List

import requests
from bs4 import BeautifulSoup # type: ignore
from bs4.element import Comment, NavigableString, Tag # type: ignore
def retrieve(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download ``url`` and parse the response body with ``html.parser``.

    The response text is always decoded as UTF-8, overriding whatever
    encoding the response would otherwise have used. The HTTP status code
    is not checked.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The parsed document.

    Raises:
        ValueError: If the response body is empty.
    """
    rq = requests.get(url, timeout=timeout)
    rq.encoding = 'utf-8'
    if not rq.text:
        raise ValueError('no text')
    return BeautifulSoup(rq.text, 'html.parser')
def walk_tree(soup: BeautifulSoup) -> str:
    """Flatten the children of ``soup`` into a single plain-text string.

    Each child ``Tag`` is rendered by the folder registered for its tag name
    in ``walk_tree._DISPATCH_TABLE``. A tag without a registered folder is
    reported on stderr and rendered by recursing into it and stripping the
    result. Text nodes are appended with surrounding whitespace stripped.
    HTML comments contribute nothing to the output.

    Args:
        soup: Node whose ``children`` are walked.

    Returns:
        The concatenation of every rendered child, in document order.
    """
    pieces: List[str] = []
    for c in soup.children:
        if isinstance(c, Tag):
            tag_ty = c.name
            folder = walk_tree._DISPATCH_TABLE.get(tag_ty)
            if folder is not None:
                pieces.append(folder(c))
            else:
                sys.stderr.write(f'warning: using default folder for {tag_ty}\n')
                pieces.append(walk_tree(c).strip())
        elif isinstance(c, Comment):
            # Comment is a NavigableString subclass, so it has to be tested
            # before the generic text branch or its text would be emitted.
            continue
        elif isinstance(c, NavigableString):
            pieces.append(c.strip())
        else:
            pieces.append(f'hmmm - have {c} of type {type(c)}')
    return ''.join(pieces)
def walk_div(div: Tag) -> str:
    """Render the contents of ``div`` followed by a newline."""
    rendered = walk_tree(div)
    return f'{rendered}\n'
def walk_p(p: Tag) -> str:
    """Render ``p`` as a tab-indented line: tab, stripped contents, newline."""
    body = walk_tree(p).strip()
    return f'\t{body}\n'
def walk_a(a: Tag) -> str:
    """Render the contents of ``a`` with one space on either side."""
    return f' {walk_tree(a)} '
def walk_code(c: Tag) -> str:
    """Render the contents of ``c`` wrapped in backticks."""
    return f'`{walk_tree(c)}`'
def walk_header(h: Tag) -> str:
    """Render ``h`` as ``= title =`` on its own line, with the title stripped."""
    title = walk_tree(h).strip()
    return '= ' + title + ' =\n'
def walk_hr(_hr: Tag) -> str:
    """Render a horizontal rule as 80 dashes and a newline; the tag is unused."""
    rule = '-' * 80
    return f'{rule}\n'
def walk_br(_br: Tag) -> str:
    """Render a line break as a single newline; the tag itself is unused."""
    line_break = '\n'
    return line_break
def walk_quote(quote: Tag) -> str:
    """Render the stripped contents of ``quote`` inside double quotes."""
    inner = walk_tree(quote).strip()
    return '"' + inner + '"'
def walk_list(l: Tag) -> str:
    """Render each ``Tag`` child of a list on its own tab-indented line.

    Text nodes that sit directly inside the list are skipped. Ordered and
    unordered lists are rendered identically.

    Args:
        l: The ``ol`` or ``ul`` element to render.

    Returns:
        One tab-prefixed line per child tag, joined by newlines, with no
        trailing newline.
    """
    # TODO: ul vs ol
    # NOTE(review): walk_tree(item) renders the *children* of each item, so an
    # <li> child is never passed through the 'li' folder and gets no '- '
    # prefix here -- confirm whether that is intended.
    return '\n'.join(
        '\t' + walk_tree(item)
        for item in l.children
        if isinstance(item, Tag)
    )
def walk_li(li: Tag) -> str:
    """Render the contents of ``li`` prefixed with ``'- '``."""
    return f'- {walk_tree(li)}'
def ignore(_: Tag) -> str:
    """Folder that discards its tag: always returns the empty string."""
    suppressed = ''
    return suppressed
# Maps a tag name to the folder that walk_tree uses to render that tag.
walk_tree._DISPATCH_TABLE: Dict[str, Callable[[BeautifulSoup], str]] = {
    # Tags whose content never reaches the output.
    **dict.fromkeys(('script', 'comment', 'style', 'noscript'), ignore),
    # Every heading level is rendered the same way.
    **dict.fromkeys(('h1', 'h2', 'h3', 'h4', 'h5', 'h6'), walk_header),
    # Both list flavours share one folder.
    **dict.fromkeys(('ol', 'ul'), walk_list),
    **dict.fromkeys(('a', 'link'), walk_a),
    'div': walk_div,
    'p': walk_p,
    'hr': walk_hr,
    'br': walk_br,
    'blockquote': walk_quote,
    'code': walk_code,
    'span': walk_tree,
    'li': walk_li,
}
def main(argv: List[str]) -> int:
    """Fetch the URL given on the command line and print it as plain text.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and
            ``argv[1]`` the URL to fetch.

    Returns:
        ``0`` on success, or ``2`` when the argument count is wrong (a usage
        message is written to stderr).
    """
    if len(argv) != 2:
        print(f'Usage: {argv[0]} url', file=sys.stderr)
        # A usage error must not look like success to the calling shell.
        return 2
    soup = retrieve(argv[1])
    # soup.body is None when the document has no <body> element; walk the
    # whole document in that case instead of crashing inside walk_tree.
    root = soup.body if soup.body is not None else soup
    print(walk_tree(root))
    return 0
# Script entry point: run main and hand its return value to the interpreter
# as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment