CBC Radio 2 broadcast log history download Python script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Python 2.7.x | |
import argparse | |
import csv | |
import datetime | |
import sys | |
from collections import defaultdict | |
# Requires requests and beautifulsoup4: pip install beautifulsoup4 requests | |
import requests | |
from bs4 import BeautifulSoup | |
# Constants
date_format = '%Y-%m-%d'
default_days_back = 7
cbc_radio_2_broadcast_logs_url = 'http://music.cbc.ca/broadcastlogs/broadcastlogs.aspx'
# Seconds to wait for the server; without a timeout requests can block forever.
request_timeout_seconds = 30
# CSV columns, in output order.
fields = 'date,time,label,artist,composer,album,title,duration'.split(',')


def parse_date(text):
    """Parse a command-line date written as YYYY-MM-DD into a datetime.

    Args:
        text: The raw option value.

    Returns:
        A datetime.datetime at midnight of the given day.

    Raises:
        argparse.ArgumentTypeError: if text does not match date_format, so
            argparse reports a usage error instead of a ValueError traceback.
    """
    try:
        # Use datetime since only it has a .strptime() class method.
        return datetime.datetime.strptime(text, date_format)
    except ValueError:
        raise argparse.ArgumentTypeError(
            'invalid date {!r}; expected YYYY-MM-DD'.format(text))


def resolve_date_range(start, end):
    """Fill in the default start/end dates and validate their order.

    Args:
        start: Requested start datetime, or None to go back
            default_days_back days from the end date.
        end: Requested end datetime, or None to use the current date.

    Returns:
        A (start_date, end_date) tuple of datetimes.

    Raises:
        ValueError: if the start date is after the end date.
    """
    end_date = end if end is not None else datetime.datetime.today()
    if start is not None:
        start_date = start
    else:
        start_date = end_date - datetime.timedelta(default_days_back)
    if start_date > end_date:
        raise ValueError('Start={} cannot be after end={}'.format(start_date, end_date))
    return start_date, end_date


def iter_days(start_date, end_date):
    """Yield start_date, start_date + 1 day, ... while not past end_date."""
    one_day = datetime.timedelta(1)
    date = start_date
    while date <= end_date:
        yield date
        date = date + one_day


def normalize_search(term):
    """Return the artist search term stripped and lower-cased as text.

    On Python 2 the command-line value is a byte string; it is decoded first
    so that a non-ASCII term can be compared with the unicode text taken from
    the page without raising UnicodeDecodeError.
    """
    if isinstance(term, bytes):
        term = term.decode(sys.getfilesystemencoding() or 'utf-8', 'replace')
    return term.strip().lower()


def tag_text(tag):
    """Return the stripped .string of a parsed tag, or '' if there is none.

    Covers both a tag that was not found (None) and a tag whose .string is
    None, so one incomplete entry does not abort the whole download.
    """
    if tag is None or tag.string is None:
        return ''
    return tag.string.strip()


def entry_attributes(entry):
    """Collect the dt/dd pairs of one log entry into a dict.

    Assumes the dt and dd tags appear in matching order. There is not always
    a 'composer' entry, and "Choral Concert" entries typically have many more
    attributes, so callers must not expect any particular key to be present.

    Args:
        entry: The parsed 'div.logShowEntry' element.

    Returns:
        A dict mapping each stripped dt text to its stripped dd text.
    """
    attributes = {}
    # zip() stops at the shorter list, so unequal dt/dd counts cannot raise
    # IndexError the way indexing both lists by position would.
    for label, value in zip(entry.find_all('dt'), entry.find_all('dd')):
        # Sometimes, an attribute is empty, so just skip over it. (label is an example)
        if label.string is None or value.string is None:
            continue
        attributes[label.string.strip()] = value.string.strip()
    return attributes


def to_csv_row(attributes):
    """Reduce attributes to the CSV columns, ready for csv.DictWriter.

    Keys outside `fields` are dropped. On Python 2 the values are encoded as
    UTF-8 since the CSV library there doesn't support Unicode; on Python 3
    they are passed through as text.
    """
    encode = sys.version_info[0] < 3
    return {
        key: (value.encode('utf-8') if encode else value)
        for key, value in attributes.items()
        if key in fields
    }


def main(argv=None):
    """Download the broadcast logs for a date range and print them as CSV.

    A '# Results from ... to ...' line and the CSV header are written to
    stdout, followed by one row per log entry whose artist contains the
    search term (case-insensitive substring match; an empty term matches
    every entry). Days whose request returns an HTTP error status are
    reported on stderr and skipped.

    Args:
        argv: Argument list to parse; None makes argparse use sys.argv[1:].
    """
    parser = argparse.ArgumentParser(description='Search CBC Radio 2 Broadcast logs')
    parser.add_argument('--search-artist', default='', dest='search_artist',
                        help='Artist to filter by; if not supplied, all entries will be shown.')
    parser.add_argument('--start', type=parse_date,
                        help='Start date. If not supplied, will search back {} days from the end date'.format(default_days_back))
    parser.add_argument('--end', type=parse_date,
                        help='End date. If not supplied, the current date is used.')
    args = parser.parse_args(argv)

    try:
        start_date, end_date = resolve_date_range(args.start, args.end)
    except ValueError as error:
        # Report a bad range as a usage error (exit status 2), not a traceback.
        parser.error(str(error))
    # Normalize once (for case insensitivity); the term does not change per entry.
    search_artist = normalize_search(args.search_artist)

    # CSV output to stdout.
    print('# Results from {} to {}.'.format(start_date.strftime(date_format), end_date.strftime(date_format)))
    csv_writer = csv.DictWriter(sys.stdout, fieldnames=fields)
    csv_writer.writeheader()

    for date in iter_days(start_date, end_date):
        day = date.strftime(date_format)
        response = requests.get(cbc_radio_2_broadcast_logs_url,
                                params={'broadcastdate': day},
                                timeout=request_timeout_seconds)
        if not response.ok:
            sys.stderr.write('Skipping {}: HTTP {}\n'.format(day, response.status_code))
            continue
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        # Could also use: soup.find_all('div', class_='logShowEntry')
        for entry in soup.select('div.logShowEntry'):
            attributes = entry_attributes(entry)
            # Simple substring match against the normalized artist name.
            if search_artist not in attributes.get('artist', '').lower():
                continue
            # Data not in attributes: Assumptions about where data will be. May break over time.
            attributes['date'] = day
            attributes['time'] = tag_text(entry.find('div', class_='logEntryTime'))
            attributes['title'] = tag_text(entry.find('h3'))
            csv_writer.writerow(to_csv_row(attributes))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment