Last active
October 7, 2022 20:36
-
-
Save holyspiritomb/eece27c6e92d498a2872c57efa78df67 to your computer and use it in GitHub Desktop.
Python 3 script for downloading videos from pbskids.org with SRT captions and converting to mkv with embedded subs. Takes pbskids.org video urls as arguments, fetches the rest. Requires youtube-dl, requests, and standard libraries. Optionally, reencode video with kkroening's ffmpeg library and display color in posix terminal with sty.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from sys import argv | |
import json | |
import re | |
import requests | |
import os | |
from os import path | |
from youtube_dl import YoutubeDL | |
# import ffmpeg | |
# from ffmpeg import probe | |
# Terminal colour codes used by printgreen()/printyellow().  They default to
# empty strings, so messages are printed without any colouring.
green = yellow = reset = ""
# Optional: colour output on non-Windows terminals via the "sty" package.
# if os.name != "nt":
#     from sty import fg
#     green = fg(0, 255, 0)
#     yellow = fg(255, 255, 0)
#     reset = fg.rs
# else:
#     pass
# Every command-line argument after the script name is treated as a page URL.
# idea: take file list of urls?
inputurls = argv[1:]

# User-Agent string of a mobile Chrome browser, sent instead of the default
# one that requests would otherwise use.
myUserAgent = "Mozilla/5.0 (Linux; Android 7.1.2; SM-G610M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.99 Mobile Safari/537.36"

# Start from the default headers of requests and override only the User-Agent.
headers = requests.utils.default_headers()
headers["User-Agent"] = myUserAgent
def captionblock(page):
    """Return the closed-caption entries listed on a video page.

    Downloads ``page``, locates the JSON object assigned to
    ``window._PBS_KIDS_DEEPLINK`` in the HTML and returns its
    ``["video_obj"]["closed_captions"]`` value unchanged.

    Args:
        page: URL of the video page to fetch.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page contains no ``window._PBS_KIDS_DEEPLINK``
            JSON block.
    """
    r = requests.get(page, headers=headers)  # gets webpage
    # Fail on HTTP errors here instead of trying to parse an error page.
    r.raise_for_status()
    # DOTALL lets the JSON object span several lines of the HTML.
    matches = re.search(
        r"window._PBS_KIDS_DEEPLINK = ({.*?});", r.text, re.DOTALL
    )
    if matches is None:
        raise ValueError(
            "No window._PBS_KIDS_DEEPLINK block found in {}".format(page)
        )
    jsondict = json.loads(matches.group(1))
    return jsondict["video_obj"]["closed_captions"]
# Characters replaced or dropped when a video title is turned into a file name.
_TITLE_TRANSLATION = str.maketrans(
    {
        "/": ", ",
        "&": "and",
        ":": "--",
        "?": "",
        "!": "",
        '"': "",
    }
)


def metadata(page):
    """Return ``(captionurl, vidurl, title)`` for a video page.

    Downloads ``page`` and reads the JSON object assigned to
    ``window._PBS_KIDS_DEEPLINK`` in the HTML.  From its ``"video_obj"``
    entry this takes the URI of the first closed caption whose format is
    ``"SRT"``, the ``"mp4"`` URL, and the ``"title"`` with the characters
    ``/ & : ? ! "`` replaced or removed so it can be used as a file name.

    Args:
        page: URL of the video page to fetch.

    Returns:
        A ``(captionurl, vidurl, title)`` tuple of strings.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page contains no ``window._PBS_KIDS_DEEPLINK``
            JSON block.
        LookupError: if the video lists no caption in SRT format.
    """
    r = requests.get(page, headers=headers)  # gets webpage
    # Fail on HTTP errors here instead of trying to parse an error page.
    r.raise_for_status()
    # DOTALL lets the JSON object span several lines of the HTML.
    matches = re.search(
        r"window._PBS_KIDS_DEEPLINK = ({.*?});", r.text, re.DOTALL
    )
    if matches is None:
        raise ValueError(
            "No window._PBS_KIDS_DEEPLINK block found in {}".format(page)
        )
    video = json.loads(matches.group(1))["video_obj"]

    # Pick the first SRT caption; previously a video without one left
    # captionurl unassigned and the return statement raised UnboundLocalError.
    captionurl = next(
        (
            cap["URI"]
            for cap in video["closed_captions"]
            if cap["format"] == "SRT"
        ),
        None,
    )
    if captionurl is None:
        raise LookupError("No SRT captions listed for {}".format(page))

    vidurl = video["mp4"]
    title = video["title"].translate(_TITLE_TRANSLATION)
    return (captionurl, vidurl, title)
def printgreen(message):
    """Print ``message`` between the module's ``green`` and ``reset`` codes."""
    colored = "".join((green, message, reset))
    print(colored)
def printyellow(message):
    """Print ``message`` between the module's ``yellow`` and ``reset`` codes."""
    colored = "".join((yellow, message, reset))
    print(colored)
def caption(input_url, output_file):
    """Download the caption file at ``input_url`` to ``output_file``.

    Does nothing except print a notice when ``output_file`` already exists.
    Otherwise the response is streamed in 1024-byte blocks to
    ``output_file + ".part"`` and renamed to ``output_file`` only once it is
    complete, so an interrupted download is never mistaken for a finished
    caption file on the next run.

    Args:
        input_url: URL of the caption file.
        output_file: path the captions are saved to.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    if path.exists(output_file):
        print("")
        printgreen("Already fetched captions.")
        print("")
        return

    partial_file = output_file + ".part"
    # The with-block closes the streamed response (and its connection) even
    # when the download or the file write fails.
    with requests.get(input_url, headers=headers, stream=True) as captiondl:
        # Raise on 4xx/5xx so an error page is never saved as captions.
        captiondl.raise_for_status()
        with open(partial_file, "wb") as captionfile:
            for block in captiondl.iter_content(1024):
                captionfile.write(block)
    os.replace(partial_file, output_file)
    print("")
    printgreen("Captions written to file as {}".format(output_file))
    print("")
def videodl(input_url, output_file):
    """Download the video at ``input_url`` to ``output_file`` with youtube-dl.

    After the download the ``upload_date`` reported by youtube-dl is printed
    (``None`` when it is not available), followed by a confirmation message.

    Args:
        input_url: URL of the video to download.
        output_file: file name the video is saved under.
    """
    ydl_opts = {
        # outtmpl is a %-style template, so a literal "%" in the file name
        # must be doubled or youtube-dl reads it as a template field.
        "outtmpl": output_file.replace("%", "%%"),
        "quiet": False,
        "no_warnings": True,
    }
    # YoutubeDL registers the default extractors itself, so no explicit
    # add_default_info_extractors() call is needed.
    with YoutubeDL(ydl_opts) as ydl:
        # One call both downloads the file and returns its info dict; this
        # avoids a second extraction request made only to read the metadata.
        # ydl.download's output is ugly as hell and i want a prettier progress bar
        meta = ydl.extract_info(input_url, download=True)
    # upload_date is not guaranteed to be present; a missing value must not
    # crash the script after the download has already succeeded.
    print((meta or {}).get("upload_date"))
    # uglyfilename = meta["webpage_url_basename"]
    # os.rename(uglyfilename, output_file)
    printgreen("Downloaded {}.".format(output_file))
# def subs_convert(inputsubs,outputsubs): | |
# input_s = ffmpeg.input(inputsubs) | |
# out = ffmpeg.output(input_s,outputsubs,scodec="webvtt") | |
# out.run() | |
# def encode_video(input_vid, input_subs, outfile): | |
# vinput = ffmpeg.input(input_vid) | |
# audio = vinput.audio.filter("acopy") | |
# subs = ffmpeg.input(input_subs) | |
# r = probe(input_vid)["streams"][0] | |
# if r["height"] > 720: | |
# w = int((720 * r["width"]) / r["height"]) | |
# if w % 2 != 0: | |
# w = w + 1 | |
# video = vinput.video.filter("scale", w, 720) | |
# else: | |
# video = vinput.video | |
# out = ffmpeg.output( | |
# audio, | |
# video, | |
# subs, | |
# outfile, | |
# acodec="aac", | |
# vcodec="libx264", | |
# scodec="srt", # webvtt for vtt | |
# disposition="default", | |
# ) | |
# printyellow("Reencoding. This will probably take a while.") | |
# out.run(quiet=True) # Eventually: make progress bar of output | |
if __name__ == "__main__":
    # Names for the three values of the tuple returned by metadata().
    keys = ("captionurl", "vidurl", "title")
    # Fetch all the metadata up front and keep it in memory as a list of
    # dicts.  metadata() downloads the page, so it is called exactly once
    # per URL.
    episode = [dict(zip(keys, metadata(url))) for url in inputurls]
    # number = len(episode)
    # if number == 1:
    #     print("Fetching 1 episode.")
    # else:
    #     print("Fetching {} episodes.".format(number))
    for ep in episode:
        title = ep["title"]
        caps = ep["captionurl"]
        subtitle_file = "{}.srt".format(title)
        print("")
        printyellow("Fetching captions for {}".format(title))
        printgreen("from url: {}.".format(caps))
        caption(caps, subtitle_file)
        printyellow("Fetching mp4 file for {}".format(title))
        print("")
        mp4file = "{}.mp4".format(title)
        videodl(ep["vidurl"], mp4file)
        # Optional re-encode to mkv with embedded subtitles (needs the
        # commented-out encode_video helper and the ffmpeg package):
        # print("")
        # printyellow("Reencoding video. Be patient.")
        # print("")
        # finalfile = "{}.mkv".format(title)
        # encode_video(mp4file, subtitle_file, finalfile)
        # printgreen("Reencoded {} as {}.".format(mp4file, finalfile))
        # print("")
        # os.remove(mp4file)
        # os.remove(subtitle_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment