Skip to content

Instantly share code, notes, and snippets.

@romanbsd
Created October 31, 2021 08:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save romanbsd/bb35b9518abe09f8632c5846e3030001 to your computer and use it in GitHub Desktop.
Save romanbsd/bb35b9518abe09f8632c5846e3030001 to your computer and use it in GitHub Desktop.
Extract tables from HTML
from typing import List, Union
from bs4 import BeautifulSoup
from bs4.element import Tag
class TableExtract:
    """Extract the data rows of an HTML table identified by its header row.

    The table is located by its header: the first ``<tr>`` that is the first
    child of its parent and whose stripped cell texts, compared as a set,
    equal the requested headers. Every following row is returned with its
    cells reordered to match the order of the requested headers.
    """

    def __init__(self, headers: List[str], keep_html=False) -> None:
        """Store the wanted headers and the output mode.

        Args:
            headers: Header texts that identify the table; they also fix the
                column order of every returned row.
            keep_html: When true, ``parse`` returns the cell ``Tag`` objects
                instead of their stripped text.
        """
        self._headers = headers
        self._keep_html = keep_html

    def parse(self, markup):
        """Return the data rows of the first table matching the headers.

        Args:
            markup: HTML handed to ``BeautifulSoup`` with the ``html.parser``
                backend.

        Returns:
            A list with one entry per data row; each entry is a list of cell
            values (stripped text, or ``Tag`` objects when ``keep_html`` was
            set) ordered like the headers given to the constructor. An empty
            list is returned when no header row matches.

        Raises:
            IndexError: If a data row has fewer cells than a selected column
                position requires.
        """
        soup = BeautifulSoup(markup, "html.parser")
        wanted = set(self._headers)
        header_row = next(
            (
                tr
                for tr in soup.select("tr:first-child")
                if set(self._parse_tr(tr)) == wanted
            ),
            None,
        )
        if header_row is None:
            return []

        # Map each requested header to its column position in the table so
        # the output follows the caller's order, not the document's order.
        found = self._parse_tr(header_row)
        indices = [found.index(h) for h in self._headers]

        rows = []
        for row in self._data_rows(header_row):
            cells = self._parse_tr(row, self._keep_html)
            rows.append([cells[i] for i in indices])
        return rows

    @staticmethod
    def _data_rows(header_row: Tag) -> List[Tag]:
        """Collect the tags that hold the data rows following *header_row*."""
        rows = [s for s in header_row.next_siblings if isinstance(s, Tag)]

        # A header inside <thead> has its data rows in the sibling <tbody>
        # section(s), not among its own siblings; without this step such
        # tables would yield no rows at all.
        section = header_row.parent
        if section is not None and section.name == "thead":
            for sibling in section.next_siblings:
                if isinstance(sibling, Tag) and sibling.name == "tbody":
                    rows.extend(
                        child for child in sibling.children if isinstance(child, Tag)
                    )
        return rows

    def _parse_tr(self, tr: Tag, keep_html=False) -> List[Union[str, Tag]]:
        """Return the cells of *tr*, i.e. its direct child tags.

        Text nodes between cells are skipped. With ``keep_html`` the ``Tag``
        objects themselves are returned; otherwise their stripped text.
        """
        cells = [cell for cell in tr.children if isinstance(cell, Tag)]
        if keep_html:
            return cells
        return [cell.text.strip() for cell in cells]
if __name__ == "__main__":
    import sys

    # Command-line entry point: argv[1] is the HTML file to read, argv[2] is
    # a comma-separated list of the header texts identifying the table.
    if len(sys.argv) < 3:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} HTML_FILE HEADER1,HEADER2,...")

    with open(sys.argv[1]) as f:
        html = f.read()
    headers = sys.argv[2].split(",")
    # keep_html=True makes parse() return the cell tags rather than text.
    te = TableExtract(headers, keep_html=True)
    print(te.parse(html))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment