Skip to content

Instantly share code, notes, and snippets.

@pchng
Created March 9, 2015 02:56
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save pchng/88e3f4e724b7c6b8763c to your computer and use it in GitHub Desktop.
CBC Radio 2 broadcast log history download Python script
#!/usr/bin/env python
# Python 2.7.x
import argparse
import csv
import datetime
import sys
from collections import defaultdict
# Requires requests and beautifulsoup4: pip install beautifulsoup4 requests
import requests
from bs4 import BeautifulSoup
# Constants
date_format = '%Y-%m-%d'
default_days_back = 7
cbc_radio_2_broadcast_logs_url = 'http://music.cbc.ca/broadcastlogs/broadcastlogs.aspx'

# Command-line interface: an optional artist filter plus an optional date range.
parser = argparse.ArgumentParser(description='Search CBC Radio 2 Broadcast logs')
parser.add_argument(
    '--search-artist',
    dest='search_artist',
    default='',
    help='Artist to filter by; if not supplied, all entries will be shown.')
parser.add_argument(
    '--start',
    help='Start date. If not supplied, will search back {} days from the end date'.format(default_days_back))
parser.add_argument(
    '--end',
    help='End date. If not supplied, the current date is used.')
args = parser.parse_args()

# datetime.datetime is used (rather than datetime.date) because strptime()
# is only available as a class method on datetime.datetime.
if args.end:
    end_date = datetime.datetime.strptime(args.end, date_format)
else:
    end_date = datetime.datetime.today()

# Without an explicit start, look back a fixed number of days from the end.
if args.start:
    start_date = datetime.datetime.strptime(args.start, date_format)
else:
    start_date = end_date - datetime.timedelta(default_days_back)

# Reject an inverted range before any request is made.
if start_date > end_date:
    raise ValueError('Start={} cannot be after end={}'.format(start_date, end_date))
# Results are written to stdout as CSV: a '#' banner line naming the date
# range, then the header row, then one row per matching log entry.
# A single parenthesized argument prints identically under the Python 2
# print statement.
print('# Results from {} to {}.'.format(
    start_date.strftime(date_format),
    end_date.strftime(date_format)))
fields = ['date', 'time', 'label', 'artist', 'composer', 'album', 'title', 'duration']
csv_writer = csv.DictWriter(sys.stdout, fieldnames=fields)
csv_writer.writeheader()
# Main loop: fetch the broadcast log page for every day in the requested
# range, turn each log entry into a CSV row and write the rows whose artist
# contains the search term.

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would hang the script indefinitely.
request_timeout_seconds = 30


def _tag_text(tag):
    """Return the stripped text of a parsed tag.

    Returns '' when the tag was not found (None) or when it has no single
    string child, instead of failing on a missing attribute.
    """
    if tag is None or tag.string is None:
        return ''
    return tag.string.strip()


def _entry_attributes(entry):
    """Collect the <dt>/<dd> pairs of one log entry into a plain dict.

    Labels and values are paired positionally. Pairs in which either side
    has no string (an empty label, for example) are skipped, and surplus
    <dt> or <dd> elements are ignored rather than raising IndexError.
    """
    pairs = {}
    for label, value in zip(entry.find_all('dt'), entry.find_all('dd')):
        if label.string is None or value.string is None:
            continue
        pairs[label.string.strip()] = value.string.strip()
    return pairs


# Normalize the search term once, outside the loops.
artist_query = args.search_artist
if isinstance(artist_query, bytes):
    # Python 2 passes argv as a byte string; decode it so a non-ASCII artist
    # name can be compared against the unicode text parsed from the page.
    artist_query = artist_query.decode(sys.getfilesystemencoding() or 'utf-8')
artist_query = artist_query.strip().lower()

# Both end points are inclusive. A range whose start is after its end
# produces no iterations.
total_days = (end_date - start_date).days + 1
for day_offset in range(total_days):
    date = start_date + datetime.timedelta(day_offset)
    date_label = date.strftime(date_format)

    r = requests.get(cbc_radio_2_broadcast_logs_url,
                     params={'broadcastdate': date_label},
                     timeout=request_timeout_seconds)
    if not r.ok:
        # An error page contains no log entries; report it on stderr (so the
        # CSV on stdout stays clean) and move on to the next day.
        sys.stderr.write('Skipping {}: HTTP status {}\n'.format(date_label, r.status_code))
        continue

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')

    # Could also use: soup.find_all('div', class_='logShowEntry')
    for entry in soup.select('div.logShowEntry'):
        # NOTE: Not every entry has every attribute (e.g. 'composer'), and
        # "Choral Concert" entries typically carry many extra attributes;
        # anything outside `fields` is dropped below.
        attributes = _entry_attributes(entry)

        # Case-insensitive substring match; an empty search term matches all.
        artist = attributes.get('artist', '')
        if artist_query not in artist.lower():
            continue

        # Data not in the dt/dd list: assumptions about where it lives in the
        # page markup. May break over time.
        attributes['date'] = date_label
        attributes['time'] = _tag_text(entry.find('div', class_='logEntryTime'))
        attributes['title'] = _tag_text(entry.find('h3'))

        # Keep only the declared CSV columns; DictWriter fills missing ones.
        row = {k: v for (k, v) in attributes.items() if k in fields}
        if sys.version_info[0] < 3:
            # The Python 2 csv module cannot write unicode, so hand it UTF-8
            # encoded byte strings.
            row = {k: v.encode('utf-8') for (k, v) in row.items()}
        csv_writer.writerow(row)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment