Skip to content

Instantly share code, notes, and snippets.

@ahknight
Last active August 29, 2015 14:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ahknight/5fab02cd007bac2b21ad to your computer and use it in GitHub Desktop.
Save ahknight/5fab02cd007bac2b21ad to your computer and use it in GitHub Desktop.
Downloads WWDC media. Works for 2011, 2013, and 2014. 2012 seems to break on authentication, even with the ADCDownloadAuth cookie provided. Requires BeautifulSoup4 and Requests, plus the value of your ADCDownloadAuth cookie stored in a JSON file named "wwdc_cookies.json" in the same directory as the script.
#!/usr/bin/env python3
import argparse
import datetime
import os
import json
import requests
from bs4 import BeautifulSoup
def scrape_session_page(url, event_name=""):
    """Scrape an Apple developer videos page and download each session's media.

    Parameters:
        url        -- page listing sessions as <li class="session"> entries.
        event_name -- human-readable prefix for the output filenames.

    Relies on module-level globals set by the CLI section of this script:
        args       -- parsed options; .pdf/.hd/.sd select which media to fetch.
        cookie_jar -- cookie dict (ADCDownloadAuth) sent with download requests.

    Files are written to the current directory. A file whose on-disk size
    already matches the server's content-length is skipped.
    """
    r = requests.get(url)
    # Name the parser explicitly: bare BeautifulSoup(html) picks whichever
    # parser happens to be installed, which can vary between machines.
    soup = BeautifulSoup(r.text, "html.parser")

    for session in soup.find_all('li', class_='session'):
        try:
            session_id = session.get('id').replace('-video', '')
            title = session.find('li', class_='title')
            track = session.find('li', class_='track')
            platform = session.find('li', class_='platform')
            downloads = session.find('p', class_='download').find_all('a')
            title = title.string if title else ""
            track = track.string if track else ""
            platform = platform.string if platform else ""
            session_name = "%s - Session %s - %s (%s, %s)" % (
                event_name, session_id, title, track, platform)
        except Exception as e:
            print("ERROR: Could not parse session entry: %r" % e)
            continue

        for link in downloads:
            # Bug fix: bind filename before the try so the except handler
            # below never hits a NameError when the failure happens before
            # the full filename has been built (e.g. link.string is None).
            filename = session_name
            try:
                link_type = link.string.upper()
                if link_type == 'PDF' and not args.pdf:
                    continue
                if link_type == 'HD' and not args.hd:
                    continue
                if link_type == 'SD' and not args.sd:
                    continue

                # The devcenter URL bounces through auth; the adcdownload
                # host serves the file directly given the auth cookie.
                link_href = link.get('href').replace(
                    'https://developer.apple.com/devcenter/download.action?path=/',
                    'http://adcdownload.apple.com/')

                filename = "%s [%s]" % (session_name, link_type)
                filename += '.pdf' if link_type == 'PDF' else '.m4v'

                if not link_href.startswith('http'):
                    continue

                r = requests.get(link_href, cookies=cookie_jar, stream=True)
                # Bug fix: the header can be absent; default to 0 instead of
                # raising KeyError.
                size = int(r.headers.get('content-length', 0))

                # Skip files that are already fully downloaded (only when the
                # server actually reported a size).
                if os.path.exists(filename) and size > 0:
                    if os.stat(filename).st_size >= size:
                        print(" %s: Done.\r" % (filename,))
                        continue

                with open(filename, 'wb') as fd:
                    chunk_size = 4 * 1024 * 1024
                    progress = 0
                    for chunk in r.iter_content(chunk_size):
                        progress += len(chunk)
                        if size:  # avoid ZeroDivisionError on unknown length
                            print(" %s: %0.2f%%\r" % (filename, progress / float(size) * 100.0),
                                  end="", flush=True)
                        fd.write(chunk)
                print("")  # newline after the \r-terminated progress line
            except Exception as e:
                print("")
                print("Error downloading %s: %r" % (filename, e))
                continue
# ---- Command-line interface and driver -------------------------------------
this_year = datetime.datetime.now().year

parser = argparse.ArgumentParser(description='Download WWDC videos and presentations.')
parser.add_argument('-y', '--year', action='append', type=str,
                    help='The year to download. Can be specified multiple times. Defaults to the current year (%d).' % this_year)
parser.add_argument('--tt', action="store_true",
                    help='Download the media from the current Tech Talks page.')
parser.add_argument('--hd', action="store_true",
                    help='Download the HD version of the session videos.')
parser.add_argument('--sd', action="store_true",
                    help='Download the SD version of the session videos.')
parser.add_argument('--pdf', action="store_true",
                    help='Download the PDFs of the session presentations.')
args = parser.parse_args()

# Default to the current WWDC year unless the user asked for Tech Talks only.
# (Idiom fix: truthiness covers both None and [] — no `== None` comparison.)
if not args.year and not args.tt:
    args.year = [str(this_year)]

# No media type selected means download everything.
if not (args.pdf or args.sd or args.hd):
    args.pdf = args.sd = args.hd = True

# The ADCDownloadAuth cookie must be supplied by the user in this file;
# scrape_session_page sends it with every download request.
with open("wwdc_cookies.json", "r") as fp:
    cookie_jar = json.load(fp)

base_url = 'https://developer.apple.com/videos/wwdc/'
tt_url = 'https://developer.apple.com/tech-talks/videos/'

if args.tt:
    scrape_session_page(tt_url, event_name="iOS 7 Tech Talks")

for year in args.year or []:
    wwdc_url = "/".join([base_url, year, ""])
    scrape_session_page(wwdc_url, event_name="WWDC %s" % year)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment