Created
January 24, 2018 20:16
-
-
Save miketahani/08dc408ad9c3b142fc335bd13a13e411 to your computer and use it in GitHub Desktop.
egghead.io series scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python3
# one-off scraper for egghead.io videos in a series
# usage: chmod +x grabber.py && ./grabber.py <series stub>
# requires youtube-dl (`brew install youtube-dl`)
import os | |
import subprocess | |
import json | |
import argparse | |
from urllib.request import urlretrieve | |
# TODO refactor to allow user to download series (playlist of videos),
# lesson (single video), or whatever else there is on the site

# --- command line -----------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument('series_stub', help='course/series stub (https://egghead.io/courses/<series_stub>)', type=str)
args = parser.parse_args()

# e.g. 'building-react-applications-with-idiomatic-redux'
series_stub = args.series_stub

manifest_url = 'https://egghead.io/api/v1/series/{}/lessons'.format(series_stub)

# --- local cache of the lesson manifest -------------------------------------
# everything (manifest + videos) is stored in a directory named after the stub
output_dir = series_stub
# exist_ok avoids the check-then-create race of a separate os.path.exists() test
os.makedirs(output_dir, exist_ok=True)

local_manifest_filename = '{}/{}.manifest.json'.format(output_dir, series_stub)

if not os.path.exists(local_manifest_filename):
    # Download under a temporary name and rename only on success, so an
    # interrupted download never leaves a truncated manifest that a later
    # run would mistake for a valid cached copy.
    partial_manifest_filename = local_manifest_filename + '.part'
    urlretrieve(manifest_url, partial_manifest_filename)
    os.replace(partial_manifest_filename, local_manifest_filename)

# urlretrieve stores the raw response bytes; JSON is decoded as UTF-8 rather
# than whatever the locale default happens to be
with open(local_manifest_filename, 'r', encoding='utf-8') as manifest_file:
    manifest = json.load(manifest_file)

# sort manifest by ids ascending so we grab the videos in order
manifest = sorted(manifest, key=lambda vid: int(vid['id']))

# get the total number of videos for the log output below
num_videos = len(manifest)

# slugs of videos whose youtube-dl invocation exited non-zero
failed_slugs = []

# --- download loop ----------------------------------------------------------
# the 1-based position in the sorted manifest becomes the filename prefix
for position, video in enumerate(manifest, start=1):
    slug = video['slug']
    url = video['http_url']

    print('\n[+] video ({id} of {total}): {slug}'.format(id=position, total=num_videos, slug=slug))

    # path template for youtube-dl; "%(ext)s" is left for youtube-dl to fill in
    path_template = '{output_dir}/{id:03d}.{slug}.%(ext)s'.format(output_dir=output_dir, id=position, slug=slug)

    # ask youtube-dl for the full local file path it would write to
    try:
        local_video_path = subprocess.check_output(['youtube-dl', '--get-filename', '-o', path_template, url, '--restrict-filenames'])
    except subprocess.CalledProcessError:
        # keep going with the remaining videos, same as for a failed download
        failed_slugs.append(slug)
        continue

    # convert path from bytes to utf-8 string, strip trailing subprocess.check_output "\n"
    local_video_path = local_video_path.decode('utf-8').strip()

    # get the video with youtube-dl; a non-zero exit is recorded, not fatal
    download = subprocess.run(['youtube-dl', '--no-overwrites', '--continue', '-o', local_video_path, url])
    if download.returncode != 0:
        failed_slugs.append(slug)

if failed_slugs:
    # surface failures instead of reporting success and exiting 0
    print('\n[!] {} of {} videos failed: {}'.format(len(failed_slugs), num_videos, ', '.join(failed_slugs)))
    raise SystemExit(1)

print('\ndone. have a nice day!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment