Skip to content

Instantly share code, notes, and snippets.

@tonyfast
Created March 12, 2020 15:46
Show Gist options
  • Save tonyfast/73a7043424611bdfd559c4eab5030378 to your computer and use it in GitHub Desktop.
Save tonyfast/73a7043424611bdfd559c4eab5030378 to your computer and use it in GitHub Desktop.
import xml.etree, pandas as 🐼, bs4 as 🍲
url=\

https://bl.ocks.org/tonyfast/2947b4bb582e193f5b2a7dbf8b009b62

__import__('requests_cache').install_cache('signal')
import requests
response = requests.get(url)
tree = 🍲.BeautifulSoup(response.content, 'xml')


def ravel(soup, level=0):
    if isinstance(soup, 🍲.element.Tag):
        yield soup, level
    
    for child in getattr(soup, 'children', []):
        yield from ravel(child, level+1)
        
def tidysoup(soup):
     return 🐼.DataFrame(ravel(tree), columns='tag level'.split()).reset_index().set_index('index level'.split()).tag.apply(
            lambda x: 🐼.Series({**x.attrs, 'tag': x.name, 'text': x.text})
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment