Skip to content

Instantly share code, notes, and snippets.

@holyspiritomb
Last active October 7, 2022 20:36
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save holyspiritomb/eece27c6e92d498a2872c57efa78df67 to your computer and use it in GitHub Desktop.
Save holyspiritomb/eece27c6e92d498a2872c57efa78df67 to your computer and use it in GitHub Desktop.
Python 3 script for downloading videos from pbskids.org with SRT captions and converting to mkv with embedded subs. Takes pbskids.org video urls as arguments, fetches the rest. Requires youtube-dl, requests, and standard libraries. Optionally, reencode video with kkroening's ffmpeg library and display color in posix terminal with sty.
#!/usr/bin/env python3
from sys import argv
import json
import re
import requests
import os
from os import path
from youtube_dl import YoutubeDL
# import ffmpeg
# from ffmpeg import probe
# ANSI color escape strings. Left empty so the print helpers below are
# plain-text no-ops unless the optional `sty` block is re-enabled.
green = ""
yellow = ""
reset = ""
# if os.name != "nt":
#     from sty import fg
#     green = fg(0, 255, 0)
#     yellow = fg(255, 255, 0)
#     reset = fg.rs
# else:
#     pass

# pbskids.org video-page URLs taken straight from the command line.
inputurls = argv[1:]
# idea: take file list of urls?

# Spoof a mobile-browser User-Agent so the site serves the page variant
# containing the embedded JSON metadata blob.
myUserAgent = "Mozilla/5.0 (Linux; Android 7.1.2; SM-G610M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.99 Mobile Safari/537.36"
headers = requests.utils.default_headers()
headers.update({"User-Agent": myUserAgent})
def captionblock(page):
    """Return the closed-caption track list for a pbskids.org video page.

    Fetches the page HTML, locates the embedded
    ``window._PBS_KIDS_DEEPLINK`` JSON blob, and returns its
    ``video_obj["closed_captions"]`` list of track dicts.
    """
    response = requests.get(page, headers=headers)  # fetch the page html
    deeplink_re = re.compile("window._PBS_KIDS_DEEPLINK = ({.*?});", re.DOTALL)
    match = deeplink_re.search(response.text)  # locate the JSON blob
    deeplink = json.loads(match.group(1))  # parse the blob into a dict
    return deeplink["video_obj"]["closed_captions"]
def metadata(page):
    """Scrape a pbskids.org video page for download metadata.

    Fetches the page, parses the embedded ``window._PBS_KIDS_DEEPLINK``
    JSON blob, and returns a ``(captionurl, vidurl, title)`` tuple:

    * ``captionurl`` -- URL of the SRT caption track, falling back to the
      first listed track when no SRT is offered (``None`` when the page
      lists no captions at all).
    * ``vidurl`` -- URL of the mp4 video file.
    * ``title`` -- episode title sanitized for use as a filename.

    Raises ``ValueError`` when the page has no DEEPLINK JSON blob.
    """
    r = requests.get(page, headers=headers)  # gets webpage
    jsontxt = re.compile("window._PBS_KIDS_DEEPLINK = ({.*?});", re.DOTALL)
    matches = jsontxt.search(r.text)  # search html for the json block
    if matches is None:
        # Fail with a clear message instead of AttributeError on None.
        raise ValueError("no _PBS_KIDS_DEEPLINK JSON block found at {}".format(page))
    jsondict = json.loads(matches.group(1))
    video_obj = jsondict["video_obj"]
    # Prefer the SRT caption track; fall back to the first listed track.
    # (The original code left `captionurl` unbound -- a NameError -- when
    # no SRT track was present.)
    captionurl = None
    for track in video_obj["closed_captions"]:
        if captionurl is None:
            captionurl = track["URI"]
        if track["format"] == "SRT":
            captionurl = track["URI"]
            break
    vidurl = video_obj["mp4"]
    # Replace or strip characters that are awkward/illegal in filenames.
    title = (
        video_obj["title"]
        .replace("/", ", ")
        .replace("&", "and")
        .replace(":", "--")
        .replace("?", "")
        .replace("!", "")
        .replace('"', "")
    )
    return (captionurl, vidurl, title)
def printgreen(message):
    """Print *message* wrapped in the green/reset ANSI codes (plain text when they are empty)."""
    colored = green + message + reset
    print(colored)
def printyellow(message):
    """Print *message* wrapped in the yellow/reset ANSI codes (plain text when they are empty)."""
    colored = yellow + message + reset
    print(colored)
def caption(input_url, output_file):
    """Download the caption file at *input_url* to *output_file*.

    Skips the download entirely when *output_file* already exists.
    """
    # Guard clause: don't re-download captions we already have.
    if path.exists(output_file):
        print("")
        printgreen("Already fetched captions.")
        print("")
        return
    captiondl = requests.get(input_url, headers=headers, stream=True)
    # Raise an HTTPError on 4xx/5xx responses so we never write an
    # error page to disk as if it were a caption file.
    captiondl.raise_for_status()
    with open(output_file, "wb") as captionfile:
        for chunk in captiondl.iter_content(1024):
            captionfile.write(chunk)
    print("")
    printgreen("Captions written to file as {}".format(output_file))
    print("")
def videodl(input_url, output_file):
    """Download the video at *input_url* to *output_file* via youtube-dl."""
    options = {"outtmpl": output_file, "quiet": False, "no_warnings": True}
    downloader = YoutubeDL(options)
    downloader.add_default_info_extractors()
    # download() expects a list of URLs, even for a single video.
    downloader.download([input_url])
    # ydl.download's output is ugly as hell and i want a prettier progress bar
    # Separate metadata fetch, used only to report the upload date.
    info = downloader.extract_info(input_url, download=False)
    print(info["upload_date"])
    printgreen("Downloaded {}.".format(output_file))
# def subs_convert(inputsubs,outputsubs):
# input_s = ffmpeg.input(inputsubs)
# out = ffmpeg.output(input_s,outputsubs,scodec="webvtt")
# out.run()
# def encode_video(input_vid, input_subs, outfile):
# vinput = ffmpeg.input(input_vid)
# audio = vinput.audio.filter("acopy")
# subs = ffmpeg.input(input_subs)
# r = probe(input_vid)["streams"][0]
# if r["height"] > 720:
# w = int((720 * r["width"]) / r["height"])
# if w % 2 != 0:
# w = w + 1
# video = vinput.video.filter("scale", w, 720)
# else:
# video = vinput.video
# out = ffmpeg.output(
# audio,
# video,
# subs,
# outfile,
# acodec="aac",
# vcodec="libx264",
# scodec="srt", # webvtt for vtt
# disposition="default",
# )
# printyellow("Reencoding. This will probably take a while.")
# out.run(quiet=True) # Eventually: make progress bar of output
if __name__ == "__main__":
    # Phase 1: fetch metadata for every requested URL up front.
    episode = []  # the list of dicts of episode metadata
    keys = ("captionurl", "vidurl", "title")
    for url in inputurls:
        # One metadata() call per URL. (The original called it twice per
        # URL -- once discarded -- doubling every network fetch.)
        ep = dict(zip(keys, metadata(url)))  # make a dict
        episode.append(ep)  # add dict to the list of dicts
    # Phase 2: download captions, then video, for each episode.
    for ep in episode:
        title = ep["title"]
        caps = ep["captionurl"]
        subtitle_file = "{}.srt".format(title)
        print("")
        printyellow("Fetching captions for {}".format(title))
        printgreen("from url: {}.".format(caps))
        caption(caps, subtitle_file)
        printyellow("Fetching mp4 file for {}".format(title))
        print("")
        mp4file = "{}.mp4".format(title)
        videodl(ep["vidurl"], mp4file)
        # Optional reencode step (requires the ffmpeg helpers above):
        # print("")
        # printyellow("Reencoding video. Be patient.")
        # print("")
        # finalfile = "{}.mkv".format(title)
        # encode_video(mp4file, subtitle_file, finalfile)
        # printgreen("Reencoded {} as {}.".format(mp4file, finalfile))
        # print("")
        # os.remove(mp4file)
        # os.remove(subtitle_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment