# Gist by @danecjensen, created March 11, 2019.
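"""Scrape the SXSW 2019 speaker and artist schedules from schedule.sxsw.com and
upload each one as a CSV to the sparklesquad/sxsw-schedule dataset on data.world."""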
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import datadotworld as ddw
import re
import string
from pyparsing import anyOpenTag, anyCloseTag
from xml.sax.saxutils import unescape

# Translate leftover HTML entities once the tags themselves are stripped.
unescape_xml_entities = lambda s: unescape(s, {'&apos;': "'", '&quot;': '"', '&nbsp;': ' '})
# pyparsing expression that deletes any HTML open or close tag from a string.
stripper = (anyOpenTag | anyCloseTag).suppress()
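# A quick sanity check of the two helpers above (hypothetical input, not from the site):
# unescape_xml_entities(stripper.transformString('<div>Ben &amp; Jerry&apos;s</div>'))
# -> "Ben & Jerry's"  (tags stripped by pyparsing, entities decoded by unescape)
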
def speaker_schedule():
    df = pd.DataFrame(columns=['name', 'title_and_location', 'summary', 'date', 'event_time'])
    index_start = 0
    for letter in string.ascii_uppercase:
        print(letter)
        # Retry each alphabetical index page up to three times before giving up.
        success_flag = False
        for _ in range(3):
            main_page = requests.get(f'https://schedule.sxsw.com/2019/speakers/alpha/{letter}.html')
            if main_page.status_code == 200:
                success_flag = True
                break
        if not success_flag:
            print(f'{letter}s failed to load')
            continue
        main_soup = BeautifulSoup(main_page.content, 'html.parser')
        ind_events = main_soup.find_all('div', class_='row single-event')
        for i, j in enumerate(ind_events):
            i += index_start
            url = 'https://schedule.sxsw.com' + re.search(r'(/2019/speakers/\d+)', str(j)).group()
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            summary = str(soup.find('div', class_='row speaker-bio'))
            title_and_location = str(soup.find_all('div', class_='small-12 columns event-details'))
            name = str(soup.find('h4'))
            date = str(soup.find('div', class_='date'))
            event_time = str(soup.find('div', class_='time'))
            df.loc[i, :] = [name, title_and_location, summary, date, event_time]
            time.sleep(0.1)
        index_start += len(ind_events)
    print(df.shape)

    def extract_time(x):
        t = re.sub(r'<div class="time">', '', x)
        return re.sub(r'</div>', '', t)

    def extract_title_name(x):
        t = unescape_xml_entities(stripper.transformString(x))
        return t.strip('[]')

    # Split 'Title at Venue' strings and return the venue part.
    def location_split(text):
        if isinstance(text, str) and ' at ' in text.lower():
            return text.split(' at ')[-1]
        return None

    df['summary'] = df['summary'].apply(extract_title_name)
    df['date'] = df['date'].apply(lambda x: unescape_xml_entities(stripper.transformString(x)))
    df['event_time'] = df['event_time'].apply(extract_time)
    df['name'] = df['name'].apply(extract_title_name)
    df['name'] = df['name'].apply(lambda x: x.replace('Events featuring ', ''))
    df['title_and_location'] = df['title_and_location'].apply(extract_title_name)
    df['venue'] = df.title_and_location.apply(location_split)
    with ddw.open_remote_file('sparklesquad/sxsw-schedule', 'speaker_schedule_2019.csv', mode='w') as f:
        df.to_csv(f, index=False)
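
# music_schedule() crawls the artist pages the same way speaker_schedule() crawls the
# speaker pages, additionally capturing genre, subgenre, home city, and an audio sample URL.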
def music_schedule():
    df = pd.DataFrame(columns=['name', 'summary', 'genre', 'subgenre', 'home', 'audio',
                               'title_and_location', 'date', 'event_time'])
    index_start = 0
    for letter in string.ascii_uppercase:
        print(letter)
        success_flag = False
        for _ in range(3):
            main_page = requests.get(f'https://schedule.sxsw.com/2019/artists/alpha/{letter}.html')
            if main_page.status_code == 200:
                success_flag = True
                break
        if not success_flag:
            print(f'{letter}s failed to load')
            continue
        main_soup = BeautifulSoup(main_page.content, 'html.parser')
        ind_events = main_soup.find_all('div', class_='row single-event')
        for i, j in enumerate(ind_events):
            i += index_start
            url = 'https://schedule.sxsw.com' + re.search(r'(/2019/artists/\d+)', str(j)).group()
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            try:
                genre, subgenre, home = soup.select('div div div p div')
                genre, subgenre, home = str(genre), str(subgenre), str(home)
            except ValueError:
                # Unpacking fails when the page doesn't have exactly three of these divs.
                genre, subgenre, home = '', '', ''
            audio = str(soup.select('audio'))
            title_and_location = str(soup.find_all('div', class_='small-12 columns event-details'))
            name = str(soup.find_all('h1', class_='artist-name'))
            date = str(soup.find('div', class_='date'))
            event_time = str(soup.find('div', class_='time'))
            summary = str(soup.find('div', class_='large-8 small-12 columns'))
            df.loc[i, :] = [name, summary, genre, subgenre, home, audio,
                            title_and_location, date, event_time]
            time.sleep(0.1)
        index_start += len(ind_events)
    print(df.shape)

    # Strip tags, then keep the text after the 'Genre:'-style label if one is present.
    def extract_genre(x):
        t = unescape_xml_entities(stripper.transformString(x))
        parts = t.split(':')
        if len(parts) > 1:
            t = parts[1]
        return t.strip()

    def extract_time(x):
        t = unescape_xml_entities(stripper.transformString(x))
        # Dropping non-ASCII removes the dash between start and end times,
        # leaving a double space, which becomes a hyphen.
        t = t.encode('ascii', 'ignore').decode('utf-8')
        return t.replace('  ', '-')

    # Pull the src URL out of '[<audio src="...">...</audio>]'.
    def extract_audio(x):
        match = re.search(r'src="([^"]*)"', x)
        return match.group(1) if match else ''

    def extract_title_name(x):
        t = unescape_xml_entities(stripper.transformString(x))
        return t.strip('[]')

    # Split 'Title at Venue' strings and return the venue part.
    def location_split(text):
        if isinstance(text, str) and ' at ' in text.lower():
            return text.split(' at ')[-1]
        return None

    df['summary'] = df['summary'].apply(lambda x: unescape_xml_entities(stripper.transformString(x)))
    df['date'] = df['date'].apply(lambda x: unescape_xml_entities(stripper.transformString(x)))
    df['genre'] = df['genre'].apply(extract_genre)
    df['subgenre'] = df['subgenre'].apply(extract_genre)
    df['home'] = df['home'].apply(extract_genre)
    df['event_time'] = df['event_time'].apply(extract_time)
    df['audio'] = df['audio'].apply(extract_audio)
    df['title_and_location'] = df['title_and_location'].apply(extract_title_name)
    df['name'] = df['name'].apply(extract_title_name)
    df['venue'] = df.title_and_location.apply(location_split)
    with ddw.open_remote_file('sparklesquad/sxsw-schedule', 'music_schedule_2019.csv', mode='w') as f:
        df.to_csv(f, index=False)

def main():
    speaker_schedule()
    music_schedule()


if __name__ == '__main__':
    main()
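
# Note: ddw.open_remote_file() needs data.world credentials configured ahead of time
# (e.g. via the `dw configure` CLI or the DW_AUTH_TOKEN environment variable).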