# leifdenby/ecmwf-event-parser.py

## ecmwf-event-parser.py
import requests
import bs4
import dateutil.parser
import pandas as pd

def parse_event_details(url):
    """
    Parse the talks listed on an ECMWF event timetable page.

    The page at `url` is downloaded, its first ``<table>`` is scanned row by
    row, and one record is produced for every row that contains a
    ``speaker-list`` element. Rows carrying the ``day-header`` or
    ``non-empty-session`` class, and rows without a speaker list, are skipped.

    An event page URL example could be
    https://events.ecmwf.int/event/304/timetable/

    Parameters
    ----------
    url : str
        Address of the event timetable page.

    Returns
    -------
    pandas.DataFrame
        One row per talk with the columns ``start_time``, ``end_time``,
        ``title``, ``speaker_name`` and ``speaker_affiliation``, plus
        ``talk_pdf_url`` / ``talk_video_url`` when at least one talk links to
        such a resource. The frame is empty when no talk row is found.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the downloaded page contains no ``<table>`` element.
    """
    # A timeout keeps the call from blocking forever on an unresponsive host.
    req = requests.get(url, timeout=30)
    req.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = bs4.BeautifulSoup(req.text, "html.parser")

    table_el = soup.find("table")
    if table_el is None:
        raise ValueError(f"no <table> element found on page {url}")

    skip_rows = ("day-header", "non-empty-session")
    # (output column, CSS class identifying the span that holds the value)
    time_fields = (("start_time", "start-time"), ("end_time", "end-time"))
    url_selectors = (
        ("talk_pdf_url", "icon-file-pdf"),
        ("talk_video_url", "icon-link"),
    )

    data = []
    for table_row in table_el.find_all("tr"):
        row_classes = table_row.get("class", [])
        if any(sr in row_classes for sr in skip_rows):
            continue

        # Only rows with a speaker list end up in the result, so check this
        # first instead of parsing rows that would be discarded anyway.
        el_speaker_list = table_row.find("div", attrs={"class": "speaker-list"})
        if el_speaker_list is None:
            continue

        entry_data = {}

        time_col = table_row.find(name="td", attrs={"class": "time"})
        for column, css_class in time_fields:
            # Match on the distinguishing class only: a *list* of classes is
            # treated by Beautiful Soup as "any of these", which would make
            # the end-time lookup return the start-time span whenever both
            # spans share the "timetable-time" class.
            el_time = time_col.find("span", attrs={"class": css_class})
            entry_data[column] = dateutil.parser.parse(el_time.attrs["data-time"])

        entry_data["title"] = table_row.find(
            "div", attrs={"class": "title"}
        ).text.strip()

        # class "" selects the first span without a class attribute; the
        # name is read from the span nested inside it. Only the first
        # speaker of the list is recorded.
        entry_data["speaker_name"] = (
            el_speaker_list.find("span", attrs={"class": ""}).find("span").text
        )

        el_affiliation = el_speaker_list.find(
            "span", attrs={"class": "affiliation"}
        )
        if el_affiliation is None:
            # Tolerate speakers listed without an affiliation.
            entry_data["speaker_affiliation"] = None
        else:
            # [1:-1] drops the first and last character of the text
            # (presumably the surrounding parentheses -- verify on the page).
            entry_data["speaker_affiliation"] = el_affiliation.text.strip()[1:-1]

        for item_name, item_class in url_selectors:
            el_url = table_row.find("a", attrs={"class": item_class})
            if el_url is None:
                continue
            # Use a separate name so the `url` argument is not overwritten.
            href = el_url.attrs["href"]
            if not href.startswith("http"):
                href = f"https://events.ecmwf.int{href}"
            entry_data[item_name] = href

        data.append(entry_data)

    return pd.DataFrame(data)
	import requests
	import bs4
	import dateutil.parser
	import pandas as pd

	def parse_event_details(url):
		"""
		Parse the talks listed on an ECMWF event timetable page.

		The page at `url` is downloaded, its first ``<table>`` is scanned row by
		row, and one record is produced for every row that contains a
		``speaker-list`` element. Rows carrying the ``day-header`` or
		``non-empty-session`` class, and rows without a speaker list, are skipped.

		An event page URL example could be
		https://events.ecmwf.int/event/304/timetable/

		Parameters
		----------
		url : str
			Address of the event timetable page.

		Returns
		-------
		pandas.DataFrame
			One row per talk with the columns ``start_time``, ``end_time``,
			``title``, ``speaker_name`` and ``speaker_affiliation``, plus
			``talk_pdf_url`` / ``talk_video_url`` when at least one talk links to
			such a resource. The frame is empty when no talk row is found.

		Raises
		------
		requests.HTTPError
			If the server answers with an error status code.
		ValueError
			If the downloaded page contains no ``<table>`` element.
		"""
		# A timeout keeps the call from blocking forever on an unresponsive host.
		req = requests.get(url, timeout=30)
		req.raise_for_status()
		# Name the parser explicitly so the result does not depend on which
		# optional parser libraries happen to be installed.
		soup = bs4.BeautifulSoup(req.text, "html.parser")

		table_el = soup.find("table")
		if table_el is None:
			raise ValueError(f"no <table> element found on page {url}")

		skip_rows = ("day-header", "non-empty-session")
		# (output column, CSS class identifying the span that holds the value)
		time_fields = (("start_time", "start-time"), ("end_time", "end-time"))
		url_selectors = (
			("talk_pdf_url", "icon-file-pdf"),
			("talk_video_url", "icon-link"),
		)

		data = []
		for table_row in table_el.find_all("tr"):
			row_classes = table_row.get("class", [])
			if any(sr in row_classes for sr in skip_rows):
				continue

			# Only rows with a speaker list end up in the result, so check this
			# first instead of parsing rows that would be discarded anyway.
			el_speaker_list = table_row.find("div", attrs={"class": "speaker-list"})
			if el_speaker_list is None:
				continue

			entry_data = {}

			time_col = table_row.find(name="td", attrs={"class": "time"})
			for column, css_class in time_fields:
				# Match on the distinguishing class only: a *list* of classes is
				# treated by Beautiful Soup as "any of these", which would make
				# the end-time lookup return the start-time span whenever both
				# spans share the "timetable-time" class.
				el_time = time_col.find("span", attrs={"class": css_class})
				entry_data[column] = dateutil.parser.parse(el_time.attrs["data-time"])

			entry_data["title"] = table_row.find(
				"div", attrs={"class": "title"}
			).text.strip()

			# class "" selects the first span without a class attribute; the
			# name is read from the span nested inside it. Only the first
			# speaker of the list is recorded.
			entry_data["speaker_name"] = (
				el_speaker_list.find("span", attrs={"class": ""}).find("span").text
			)

			el_affiliation = el_speaker_list.find(
				"span", attrs={"class": "affiliation"}
			)
			if el_affiliation is None:
				# Tolerate speakers listed without an affiliation.
				entry_data["speaker_affiliation"] = None
			else:
				# [1:-1] drops the first and last character of the text
				# (presumably the surrounding parentheses -- verify on the page).
				entry_data["speaker_affiliation"] = el_affiliation.text.strip()[1:-1]

			for item_name, item_class in url_selectors:
				el_url = table_row.find("a", attrs={"class": item_class})
				if el_url is None:
					continue
				# Use a separate name so the `url` argument is not overwritten.
				href = el_url.attrs["href"]
				if not href.startswith("http"):
					href = f"https://events.ecmwf.int{href}"
				entry_data[item_name] = href

			data.append(entry_data)

		return pd.DataFrame(data)