Skip to content

Instantly share code, notes, and snippets.

@pianycist pianycist/pbskids-dl.py
Last active May 23, 2019

Embed
What would you like to do?
Python 3 script for downloading videos from pbskids.org with SRT captions. Takes pbskids.org video urls as arguments, fetches the rest. Requires only youtube-dl, requests and standard libraries. Doesn't embed the captions or do any conversion of the mp4 video.
#!/usr/bin/env python3
from sys import argv
import requests
import json
import re
from youtube_dl import YoutubeDL
import os
# Page URLs to process: every command-line argument after the script name.
inputurls = argv[1:]

# Parallel per-episode lists; entry i of each list describes the same episode.
urls, captions, titles, rawtitles = [], [], [], []

# youtube-dl instance that names each downloaded file after the video id.
ydl = YoutubeDL({'outtmpl': '%(id)s.mp4'})
ydl.add_default_info_extractors()

# Start from requests' default headers and swap in a mobile Chrome User-Agent.
headers = requests.utils.default_headers()
headers['User-Agent'] = 'Mozilla/5.0 (Linux; Android 7.1.2; SM-G610M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.99 Mobile Safari/537.36'
# Captures the JSON object assigned to window._PBS_KIDS_DEEPLINK in the page.
_DEEPLINK_RE = re.compile(r'window\._PBS_KIDS_DEEPLINK = ({.*?});', re.DOTALL)

# Characters that are awkward in filenames and what each one becomes.
# NOTE(review): an earlier revision also chained a no-op replace("", "");
# if a character was lost there, add it to this table.
_TITLE_TRANSLATION = str.maketrans({"/": "--", " ": "-", "'": None, "&": "and"})


def _sanitize_title(rawtitle):
    """Return rawtitle with filename-unfriendly characters replaced or removed."""
    return rawtitle.translate(_TITLE_TRANSLATION)


def fetchurls():
    """Scrape every page in ``inputurls`` and record its video metadata.

    For each page the embedded ``window._PBS_KIDS_DEEPLINK`` JSON object is
    parsed, and the caption URL, mp4 URL, raw title and filename-safe title
    are appended to the module-level ``captions``, ``urls``, ``rawtitles``
    and ``titles`` lists.

    Raises:
        requests.HTTPError: if a page request returns an error status.
        ValueError: if the JSON block cannot be found in the page (or, via
            ``json.loads``, cannot be decoded).
        KeyError, IndexError: if the JSON lacks an expected field.
    """
    for pageurl in inputurls:
        r = requests.get(pageurl, headers=headers)  # gets the html
        # Fail here rather than trying to parse an error page.
        r.raise_for_status()
        print("Fetched page.")
        rawhtml = r.text
        print("Stored page.")

        matches = _DEEPLINK_RE.search(rawhtml)
        if matches is None:
            raise ValueError(
                "Could not find the window._PBS_KIDS_DEEPLINK JSON block "
                "in {}".format(pageurl)
            )
        print("Found json block.")
        jsondict = json.loads(matches.group(1))
        print("Loaded json block.")

        # Read every field before touching the shared lists, so that a
        # missing key cannot leave the parallel lists with unequal lengths.
        video = jsondict["video_obj"]
        # NOTE(review): the caption entry is taken from fixed index 2;
        # presumably that is the SRT variant -- confirm against a live page.
        captionurl = video["closed_captions"][2]["URI"]
        url = video["mp4"]  # url that youtube-dl can use
        rawtitle = video["title"]

        captions.append(captionurl)
        urls.append(url)
        rawtitles.append(rawtitle)
        titles.append(_sanitize_title(rawtitle))
# Optionally generate a script of youtube-dl commands (currently disabled).
# def scriptgen():
# for i in range(0,total):
# with open("pbsdlscript", 'a') as fscript:
# fscript.write("# {}\n".format(rawtitles[i]))
# fscript.write("youtube-dl '{}' -o '{}.mp4'\n".format(urls[i],titles[i]))
# #fscript.write("# Caption URL: {}".format(captions[i]))
# print("Added {} to script".format(rawtitles[i]))
def getcaptions():
    """Download each recorded caption file and save it as ``<title>.srt``.

    Walks the module-level ``captions``, ``titles`` and ``rawtitles`` lists
    in lockstep. Each caption URL is fetched with the shared ``headers`` and
    streamed to disk in 1 KiB chunks.

    Raises:
        requests.HTTPError: if a caption request returns an error status.
    """
    for captionurl, title, rawtitle in zip(captions, titles, rawtitles):
        # The context manager releases the streamed connection even if
        # writing the file fails part-way through.
        with requests.get(captionurl, headers=headers, stream=True) as captiondl:
            captiondl.raise_for_status()
            print("Got captions for {}".format(rawtitle))
            with open("{}.srt".format(title), 'wb') as captionfile:
                for block in captiondl.iter_content(1024):
                    captionfile.write(block)
        print("Wrote captions to {}.srt".format(title))
def getvids():
    """Download each recorded video and rename it to ``<title>.mp4``.

    Walks the module-level ``urls``, ``titles`` and ``rawtitles`` lists in
    lockstep. Each video is downloaded through the shared ``ydl`` instance
    and then renamed from the name youtube-dl chose to the sanitized title.

    Raises:
        OSError: if the downloaded file cannot be renamed.
    """
    for url, title, rawtitle in zip(urls, titles, rawtitles):
        print("Downloading {}".format(rawtitle))
        # One call both downloads the video and returns its metadata, which
        # avoids a second extraction request just to learn the filename.
        meta = ydl.extract_info(url, download=True)
        # Ask youtube-dl for the name it actually wrote (derived from the
        # configured output template) instead of guessing it from the URL.
        uglyfilename = ydl.prepare_filename(meta)
        prettyfilename = "{}.mp4".format(title)
        os.rename(uglyfilename, prettyfilename)
        print("Downloaded {} as {}, renamed to {}".format(rawtitle, uglyfilename, prettyfilename))
# Run the pipeline only when executed as a script, so importing this module
# does not trigger any downloads.
if __name__ == "__main__":
    # Give a usage hint instead of silently doing nothing.
    if not inputurls:
        raise SystemExit("usage: {} URL [URL ...]".format(argv[0]))
    fetchurls()
    # Episode count, kept as a module-level name for code in this file that
    # indexes the parallel lists by it.
    total = len(urls)
    #scriptgen()
    getcaptions()
    getvids()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.