Skip to content

Instantly share code, notes, and snippets.

@ayhanfuat
Last active July 28, 2021 15:36
Show Gist options
  • Save ayhanfuat/945d64c69a4bdb5f4f60c58e143e3074 to your computer and use it in GitHub Desktop.
Save ayhanfuat/945d64c69a4bdb5f4f60c58e143e3074 to your computer and use it in GitHub Desktop.
Scrapes the data for EuroPython talks and writes it to a CSV file that can be uploaded to a calendar
from pandas import DataFrame
from bs4 import BeautifulSoup
import requests
from calendar import month_abbr
import datetime
# Scrape the EuroPython 2021 session list, read every talk page and write the
# schedule to ``schedule.csv`` with the columns listed in CSV_COLUMNS.

BASE_URL = 'https://ep2021.europython.eu'
SESSIONS_URL = f'{BASE_URL}/events/sessions/'

# 'Jan' -> 1 ... 'Dec' -> 12; month_abbr[0] is the empty string and is skipped.
MONTH_MAP = {month: idx for idx, month in enumerate(month_abbr) if month}

# The scraped date text is parsed without a year, so the year is fixed here.
YEAR = 2021

# Talks that don't seem to have a date assigned; they cannot be scheduled.
UNDATED_TALKS = frozenset({'/talks/BjExfG9-python-security-best-practises/'})

# Seconds to wait for the site before failing instead of hanging forever.
REQUEST_TIMEOUT = 30

# A 180-minute session is booked as 195 minutes in the calendar
# (presumably to cover a break -- TODO confirm against the programme).
LONG_SESSION_MINUTES = 180
LONG_SESSION_EXTRA_MINUTES = 15

CSV_COLUMNS = [
    'Subject',
    'Start Date',
    'Start Time',
    'End Date',
    'End Time',
    'Description',
]


def fetch_soup(url):
    """Download *url* and return the page parsed with ``html.parser``.

    Raises ``requests.HTTPError`` for an error status so that an error page
    is never scraped as if it were real content.
    """
    resp = requests.get(url, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')


def list_talk_paths(soup):
    """Return the ``/talks/...`` hrefs linked from the parsed sessions page.

    Entries listed in UNDATED_TALKS are left out.
    """
    # The attribute selector only matches anchors that carry an href, so the
    # subscript below cannot fail on an anchor without one.
    anchors = soup.select('#content ul li a[href^="/talks/"]')
    return [a['href'] for a in anchors if a['href'] not in UNDATED_TALKS]


def parse_start(date_text):
    """Return the naive start ``datetime`` encoded in a talk's date text.

    The text must consist of three ``', '``-separated fields: one that is
    ignored, ``'<month abbreviation> <day>'`` and a time range whose part
    before the first ``'-'`` is ``'<hour>:<minute>'``.

    Raises ``ValueError`` when the text does not have that shape and
    ``KeyError`` when the month abbreviation is unknown.
    """
    _, day_and_month, time_range = date_text.split(', ')
    month, day = day_and_month.split()
    hour, minute = time_range.partition('-')[0].split(':')
    return datetime.datetime(
        YEAR, MONTH_MAP[month], int(day), int(hour), int(minute)
    )


def talk_minutes(details):
    """Return the calendar length in minutes described by *details*.

    Only the first ``'; '``-separated field is inspected; the number between
    ``'('`` and ``' mins'`` in it is the duration.  Sessions of
    LONG_SESSION_MINUTES get LONG_SESSION_EXTRA_MINUTES added.
    """
    talk_type = details.split('; ')[0]
    minutes = int(talk_type[talk_type.find('(') + 1:talk_type.find(' mins')])
    if minutes == LONG_SESSION_MINUTES:
        minutes += LONG_SESSION_EXTRA_MINUTES
    return minutes


def build_event(title, speaker, date_text, details, talk):
    """Return one calendar row (a dict) for a talk.

    *talk* is the site-relative path of the talk page; it is appended to
    BASE_URL inside the description.  'Start' and 'End' are naive datetimes.
    """
    start = parse_start(date_text)
    end = start + datetime.timedelta(minutes=talk_minutes(details))
    return {
        'Subject': title,
        'Start': start,
        'End': end,
        'Description': f'{speaker} \n{BASE_URL}{talk}\n{details}',
    }


def scrape_talk(talk):
    """Fetch the page of *talk* and return its calendar row.

    Returns ``None`` (after printing a notice) when the page has no date
    button, so a single unscheduled talk does not abort the whole run.
    """
    soup = fetch_soup(f'{BASE_URL}{talk}')
    date_button = soup.select_one('#talk_page .talk a.btn')
    if date_button is None:
        print(f'Skipping {talk}: no date found on the talk page')
        return None
    return build_event(
        title=soup.select('h1')[0].text,
        speaker=soup.select('h5')[0].text,
        date_text=date_button.text,
        details=soup.select('#talk_page p code')[0].text,
        talk=talk,
    )


def to_calendar_frame(events):
    """Return a DataFrame with the CSV_COLUMNS built from the event rows.

    The naive 'Start'/'End' values are localized to the 'CET' zone and
    converted to UTC before being split into date and time strings.
    """
    df = DataFrame(events)
    for col in ('Start', 'End'):
        # NOTE(review): relies on the tz database's 'CET' zone applying
        # summer time for the July dates -- confirm if the zone is changed.
        df[col] = df[col].dt.tz_localize('CET').dt.tz_convert('UTC')
        df[f'{col} Date'] = df[col].dt.strftime('%Y-%m-%d')
        df[f'{col} Time'] = df[col].dt.strftime('%H:%M')
    return df[CSV_COLUMNS]


def main():
    """Scrape all talks and write them to ``schedule.csv``."""
    talks = list_talk_paths(fetch_soup(SESSIONS_URL))
    events = [event for event in map(scrape_talk, talks) if event is not None]
    if not events:
        # An empty frame has no 'Start' column; fail with a clear message.
        raise SystemExit('No scheduled talks found; schedule.csv not written.')
    to_calendar_frame(events).to_csv('schedule.csv', index=False)


if __name__ == '__main__':
    main()