Skip to content

Instantly share code, notes, and snippets.

@leifdenby
Last active December 1, 2022 09:55
Show Gist options
  • Save leifdenby/c5a4e5f8c147d13863c02ba6e0438035 to your computer and use it in GitHub Desktop.
Save leifdenby/c5a4e5f8c147d13863c02ba6e0438035 to your computer and use it in GitHub Desktop.
Parser for ECMWF event pages
import requests
import bs4
import dateutil.parser
import pandas as pd
def parse_event_details(url):
    """
    Parse event details from an ECMWF event page into a pandas.DataFrame.

    The page at ``url`` is downloaded and every row of its first ``<table>``
    is turned into one record, except rows whose CSS classes include
    ``day-header`` or ``non-empty-session`` and rows without a
    ``<td class="time">`` cell.

    An event page URL example could be
    https://events.ecmwf.int/event/304/timetable/

    Parameters
    ----------
    url : str
        Address of the event timetable page.

    Returns
    -------
    pandas.DataFrame
        One row per timetable entry with the columns ``start_time``,
        ``end_time`` and ``title``, plus ``speaker_name``,
        ``speaker_affiliation``, ``talk_pdf_url`` and ``talk_video_url``
        for the entries where the page provides them. The frame is empty
        when the table holds no entry rows.

    Raises
    ------
    requests.RequestException
        If the download fails, times out or returns an HTTP error status.
    ValueError
        If the page does not contain a ``<table>`` element.
    """
    # Function-scope import so the file's import block needs no change.
    from urllib.parse import urljoin

    # Without a timeout requests waits forever on a stalled server.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Name the parser explicitly: letting bs4 guess makes the result depend
    # on which parser libraries happen to be installed (and emits a warning).
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    table_el = soup.find("table")
    if table_el is None:
        raise ValueError(f"no <table> element found on event page {url!r}")

    skip_rows = ("day-header", "non-empty-session")
    url_selectors = {
        "talk_pdf_url": "icon-file-pdf",
        "talk_video_url": "icon-link",
    }

    data = []
    for table_row in table_el.find_all("tr"):
        row_classes = table_row.get("class", [])
        if any(skipped in row_classes for skipped in skip_rows):
            continue

        time_col = table_row.find(name="td", attrs={"class": "time"})
        if time_col is None:
            # Not a timetable entry (nothing to read timings from).
            continue

        entry_data = {
            "start_time": _parse_time_span(time_col, "start-time"),
            "end_time": _parse_time_span(time_col, "end-time"),
            "title": table_row.find(
                "div", attrs={"class": "title"}
            ).text.strip(),
        }

        el_speaker_list = table_row.find("div", attrs={"class": "speaker-list"})
        if el_speaker_list is not None:
            # class="" selects the first span that has no class attribute.
            entry_data["speaker_name"] = (
                el_speaker_list.find("span", attrs={"class": ""}).find("span").text
            )
            el_affiliation = el_speaker_list.find(
                "span", attrs={"class": "affiliation"}
            )
            if el_affiliation is not None:
                # Drop the first and last character of the text
                # (presumably the enclosing parentheses - confirm against
                # the page markup).
                entry_data["speaker_affiliation"] = el_affiliation.text.strip()[1:-1]

        for item_name, item_class in url_selectors.items():
            el_url = table_row.find("a", attrs={"class": item_class})
            if el_url is not None:
                # Resolve relative links against the page actually fetched;
                # absolute links pass through urljoin unchanged.
                entry_data[item_name] = urljoin(response.url, el_url.attrs["href"])

        data.append(entry_data)

    return pd.DataFrame(data)


def _parse_time_span(time_col, marker_class):
    """Parse the ``data-time`` attribute of the span tagged *marker_class*."""
    # Match on the distinguishing class only. Passing a list of classes to
    # bs4 matches a tag carrying ANY of them, which would select the same
    # (first) span for both the start and the end time.
    span = time_col.find("span", class_=marker_class)
    return dateutil.parser.parse(span.attrs["data-time"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment