# romanbsd/table_extract.py

## table_extract.py
from typing import List, Union

from bs4 import BeautifulSoup
from bs4.element import Tag


class TableExtract:
    """Extract the rows of an HTML table identified by its header row.

    The header row is the first ``<tr>`` that is the first child of its
    parent and whose cell texts, taken as a set, equal the requested headers.
    Data rows are the element siblings that follow that header row, so rows
    nested under a different parent than the header row are not picked up.
    """

    def __init__(self, headers: List[str], keep_html=False) -> None:
        """Remember which columns to extract and in what form.

        Args:
            headers: Header texts to look for. The cells of every returned
                row follow the order of this list.
            keep_html: When true, ``parse`` returns the cell ``Tag`` objects
                instead of their stripped text.
        """
        self._headers = headers
        self._keep_html = keep_html

    def parse(self, markup) -> List[List[Union[str, Tag]]]:
        """Return the data rows of the first table matching the headers.

        Args:
            markup: HTML handed to ``BeautifulSoup`` with the ``html.parser``
                backend.

        Returns:
            One list per data row, holding the cells of the requested columns
            in the order the headers were given. An empty list is returned
            when no header row matches. Rows that have too few cells to cover
            every requested column are skipped.
        """
        soup = BeautifulSoup(markup, "html.parser")
        wanted = set(self._headers)
        header_row = next(
            (
                candidate
                for candidate in soup.select("tr:first-child")
                if set(self._parse_tr(candidate)) == wanted
            ),
            None,
        )
        if header_row is None:
            return []

        # Map each requested header to its column position in the document,
        # so the output follows the caller's order, not the table's.
        found = self._parse_tr(header_row)
        indices = [found.index(name) for name in self._headers]
        # Smallest number of cells a row needs for every index to be valid.
        min_cells = max(indices, default=-1) + 1

        rows = []
        for sibling in header_row.next_siblings:
            # next_siblings also yields text nodes (whitespace between rows).
            if not isinstance(sibling, Tag):
                continue
            cells = self._parse_tr(sibling, self._keep_html)
            if len(cells) < min_cells:
                # Short rows (e.g. colspan separators or footers) used to
                # abort the whole parse with an IndexError; skip them.
                continue
            rows.append([cells[i] for i in indices])
        return rows

    def _parse_tr(self, tr: Tag, keep_html=False) -> List[Union[str, Tag]]:
        """Return the element children of *tr*.

        Args:
            tr: The row whose direct children are collected.
            keep_html: When true the child ``Tag`` objects are returned
                as-is; otherwise their stripped text is returned.
        """
        cells = [child for child in tr.children if isinstance(child, Tag)]
        if keep_html:
            return cells
        return [cell.text.strip() for cell in cells]


# Command-line entry point: table_extract.py HTML_FILE HEADER[,HEADER...]
# Prints the extracted rows (cells kept as HTML tags) for the given headers.
if __name__ == "__main__":
    import sys

    # Fail with a usage message instead of a bare IndexError on missing args.
    if len(sys.argv) < 3:
        sys.exit("usage: {} HTML_FILE HEADER[,HEADER...]".format(sys.argv[0]))

    # The file is read with the platform's default text encoding.
    with open(sys.argv[1]) as f:
        html = f.read()

    # Cell text is stripped before comparison, so a header name with
    # surrounding whitespace (e.g. "a, b") could never match; strip it here.
    headers = [name.strip() for name in sys.argv[2].split(",")]
    te = TableExtract(headers, keep_html=True)
    print(te.parse(html))
	from typing import List, Union

	from bs4 import BeautifulSoup
	from bs4.element import Tag


	class TableExtract:
	    """Extract the rows of an HTML table identified by its header row.

	    The header row is the first ``<tr>`` that is the first child of its
	    parent and whose cell texts, taken as a set, equal the requested
	    headers. Data rows are the element siblings that follow that header
	    row, so rows nested under a different parent are not picked up.
	    """

	    def __init__(self, headers: List[str], keep_html=False) -> None:
	        """Remember which columns to extract and in what form.

	        Args:
	            headers: Header texts to look for. The cells of every returned
	                row follow the order of this list.
	            keep_html: When true, ``parse`` returns the cell ``Tag``
	                objects instead of their stripped text.
	        """
	        self._headers = headers
	        self._keep_html = keep_html

	    def parse(self, markup) -> List[List[Union[str, Tag]]]:
	        """Return the data rows of the first table matching the headers.

	        Args:
	            markup: HTML handed to ``BeautifulSoup`` with the
	                ``html.parser`` backend.

	        Returns:
	            One list per data row, holding the cells of the requested
	            columns in the order the headers were given. An empty list is
	            returned when no header row matches. Rows that have too few
	            cells to cover every requested column are skipped.
	        """
	        soup = BeautifulSoup(markup, "html.parser")
	        wanted = set(self._headers)
	        header_row = next(
	            (
	                candidate
	                for candidate in soup.select("tr:first-child")
	                if set(self._parse_tr(candidate)) == wanted
	            ),
	            None,
	        )
	        if header_row is None:
	            return []

	        # Map each requested header to its column position in the document,
	        # so the output follows the caller's order, not the table's.
	        found = self._parse_tr(header_row)
	        indices = [found.index(name) for name in self._headers]
	        # Smallest number of cells a row needs for every index to be valid.
	        min_cells = max(indices, default=-1) + 1

	        rows = []
	        for sibling in header_row.next_siblings:
	            # next_siblings also yields text nodes between rows.
	            if not isinstance(sibling, Tag):
	                continue
	            cells = self._parse_tr(sibling, self._keep_html)
	            if len(cells) < min_cells:
	                # Short rows (e.g. colspan separators or footers) would
	                # raise IndexError below; skip them.
	                continue
	            rows.append([cells[i] for i in indices])
	        return rows

	    def _parse_tr(self, tr: Tag, keep_html=False) -> List[Union[str, Tag]]:
	        """Return the element children of *tr*.

	        Args:
	            tr: The row whose direct children are collected.
	            keep_html: When true the child ``Tag`` objects are returned
	                as-is; otherwise their stripped text is returned.
	        """
	        cells = [child for child in tr.children if isinstance(child, Tag)]
	        if keep_html:
	            return cells
	        return [cell.text.strip() for cell in cells]


	# Command-line entry point: table_extract.py HTML_FILE HEADER[,HEADER...]
	# Prints the extracted rows (cells kept as HTML tags) for the given headers.
	if __name__ == "__main__":
	    import sys

	    # Fail with a usage message instead of a bare IndexError on missing args.
	    if len(sys.argv) < 3:
	        sys.exit("usage: {} HTML_FILE HEADER[,HEADER...]".format(sys.argv[0]))

	    # The file is read with the platform's default text encoding.
	    with open(sys.argv[1]) as f:
	        html = f.read()

	    # Cell text is stripped before comparison, so a header name with
	    # surrounding whitespace (e.g. "a, b") could never match; strip it here.
	    headers = [name.strip() for name in sys.argv[2].split(",")]
	    te = TableExtract(headers, keep_html=True)
	    print(te.parse(html))