holyspiritomb/pbskids-dl.py

## pbskids-dl.py
#!/usr/bin/env python3

from sys import argv
import json
import re
import requests
import os
from os import path
from youtube_dl import YoutubeDL
# import ffmpeg
# from ffmpeg import probe


# Prefix/suffix strings wrapped around messages by printgreen() and
# printyellow().  All three are empty, so output is printed without colour.
green = yellow = reset = ""

# if os.name != "nt":
#     from sty import fg
#     green = fg(0, 255, 0)
#     yellow = fg(255, 255, 0)
#     reset = fg.rs
# else:
#     pass

# Every command-line argument after the script name is treated as a page URL.
inputurls = argv[1:]

# idea: take file list of urls?

# User-Agent string sent with every request made through ``headers``.
myUserAgent = (
    "Mozilla/5.0 (Linux; Android 7.1.2; SM-G610M) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/71.0.3578.99 Mobile Safari/537.36"
)
# Start from the default headers of requests and override the User-Agent.
headers = requests.utils.default_headers()
headers["User-Agent"] = myUserAgent


# Captures the JSON object that the page assigns to
# ``window._PBS_KIDS_DEEPLINK``; DOTALL lets the object span several lines.
_DEEPLINK_RE = re.compile("window._PBS_KIDS_DEEPLINK = ({.*?});", re.DOTALL)

# Character clean-up applied to a video title before it is used as a file
# name.  None of the replacement texts contains a character that is itself
# replaced, so one translate() pass equals replacing them one after another.
_TITLE_TRANSLATION = str.maketrans(
    {"/": ", ", "&": "and", ":": "--", "?": None, "!": None, '"': None}
)


def _video_object(page):
    """Fetch *page* and return the ``video_obj`` mapping embedded in it.

    Args:
        page: URL of the web page to fetch.

    Returns:
        The ``video_obj`` entry of the JSON block found in the page.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no ``window._PBS_KIDS_DEEPLINK`` block.
    """
    r = requests.get(page, headers=headers)
    r.raise_for_status()
    matches = _DEEPLINK_RE.search(r.text)
    if matches is None:
        # search() returns None when the pattern is absent; fail with a
        # message that names the page instead of an AttributeError.
        raise ValueError("No PBS Kids video data found at {}".format(page))
    return json.loads(matches.group(1))["video_obj"]


def captionblock(page):
    """Return the ``closed_captions`` entry of the video embedded in *page*.

    Args:
        page: URL of the web page to fetch.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no ``window._PBS_KIDS_DEEPLINK`` block.
    """
    return _video_object(page)["closed_captions"]


def metadata(page):
    """Return the caption URL, video URL and cleaned title for *page*.

    Args:
        page: URL of the web page to fetch.

    Returns:
        A tuple ``(captionurl, vidurl, title)``: the ``URI`` of the first
        caption entry whose ``format`` is ``"SRT"``, the ``mp4`` value of
        the video object, and the video title with ``/ & : ? ! "`` replaced
        or removed.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no ``window._PBS_KIDS_DEEPLINK`` block.
        LookupError: if none of the caption entries is in SRT format.
    """
    video = _video_object(page)

    captionurl = None
    for entry in video["closed_captions"]:
        if entry["format"] == "SRT":
            captionurl = entry["URI"]
            break
    if captionurl is None:
        # Without this check the return below would fail with an
        # UnboundLocalError whenever no SRT entry exists.
        raise LookupError("No SRT captions found for {}".format(page))

    vidurl = video["mp4"]
    title = video["title"].translate(_TITLE_TRANSLATION)
    return (captionurl, vidurl, title)


def printgreen(message):
    """Print *message* wrapped in the ``green`` and ``reset`` strings."""
    print("".join((green, message, reset)))


def printyellow(message):
    """Print *message* wrapped in the ``yellow`` and ``reset`` strings."""
    print("".join((yellow, message, reset)))


def caption(input_url, output_file):
    """Download the caption file at *input_url* and save it as *output_file*.

    Nothing is downloaded when *output_file* already exists.

    Args:
        input_url: URL of the caption file.
        output_file: path the captions are written to.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    if path.exists(output_file):
        print("")
        printgreen("Already fetched captions.")
        print("")
        return

    # Download under a temporary name and rename only once everything has
    # been written; otherwise an interrupted download would leave a truncated
    # file that the check above treats as already fetched.
    partial_file = output_file + ".part"
    # The with-block closes the streamed response even if an error occurs.
    with requests.get(input_url, headers=headers, stream=True) as captiondl:
        # Raise an exception for 4xx/5xx answers instead of saving the
        # server's error page as if it were captions.
        captiondl.raise_for_status()
        with open(partial_file, "wb") as captionfile:
            for block in captiondl.iter_content(1024):
                captionfile.write(block)
    os.replace(partial_file, output_file)
    print("")
    printgreen("Captions written to file as {}".format(output_file))
    print("")


def videodl(input_url, output_file):
    """Download the video at *input_url* to *output_file* using YoutubeDL.

    After the download, the video's info is extracted a second time
    (without downloading) and its ``upload_date`` value is printed.

    Args:
        input_url: URL of the video.
        output_file: value passed to YoutubeDL as its ``outtmpl`` option.
    """
    downloader = YoutubeDL(
        dict(outtmpl=output_file, quiet=False, no_warnings=True)
    )
    downloader.add_default_info_extractors()
    # download() takes a list of URLs, even for a single video.
    downloader.download([input_url])
    # TODO: the built-in download output is ugly; a prettier progress bar
    # would be nicer.
    info = downloader.extract_info(input_url, download=False)
    print(info["upload_date"])
    # uglyfilename = info["webpage_url_basename"]
    # os.rename(uglyfilename, output_file)
    printgreen("Downloaded {}.".format(output_file))


# def subs_convert(inputsubs,outputsubs):
#     input_s = ffmpeg.input(inputsubs)
#     out = ffmpeg.output(input_s,outputsubs,scodec="webvtt")
#     out.run()


# def encode_video(input_vid, input_subs, outfile):
#     vinput = ffmpeg.input(input_vid)
#     audio = vinput.audio.filter("acopy")
#     subs = ffmpeg.input(input_subs)
#     r = probe(input_vid)["streams"][0]
#     if r["height"] > 720:
#         w = int((720 * r["width"]) / r["height"])
#         if w % 2 != 0:
#             w = w + 1
#         video = vinput.video.filter("scale", w, 720)
#     else:
#         video = vinput.video
#     out = ffmpeg.output(
#         audio,
#         video,
#         subs,
#         outfile,
#         acodec="aac",
#         vcodec="libx264",
#         scodec="srt",  # webvtt for vtt
#         disposition="default",
#     )
#     printyellow("Reencoding. This will probably take a while.")
#     out.run(quiet=True) # Eventually: make progress bar of output


if __name__ == "__main__":

    # Fetch the metadata of every URL before any download starts.  Each page
    # is requested exactly once; the result tuple is turned into a dict.
    keys = ("captionurl", "vidurl", "title")
    episode = [dict(zip(keys, metadata(url))) for url in inputurls]

    # number = len(episode)

    # if number == 1:
    #     print("Fetching 1 episode.")
    # else:
    #     print("Fetching {} episodes.".format(number))

    # Download captions and video for each episode, named after its title.
    for ep in episode:
        title = ep["title"]
        caps = ep["captionurl"]
        subtitle_file = "{}.srt".format(title)
        print("")
        printyellow("Fetching captions for {}".format(title))
        printgreen("from url: {}.".format(caps))
        caption(caps, subtitle_file)
        printyellow("Fetching mp4 file for {}".format(title))
        print("")
        mp4file = "{}.mp4".format(title)
        videodl(ep["vidurl"], mp4file)
        # print("")
        # printyellow("Reencoding video. Be patient.")
        # print("")
        # finalfile = "{}.mkv".format(title)
        # encode_video(mp4file, subtitle_file, finalfile)
        # printgreen("Reencoded {} as {}.".format(mp4file, finalfile))
        # print("")
        # os.remove(mp4file)
        # os.remove(subtitle_file)
	#!/usr/bin/env python3

	from sys import argv
	import json
	import re
	import requests
	import os
	from os import path
	from youtube_dl import YoutubeDL
	# import ffmpeg
	# from ffmpeg import probe


	# Prefix/suffix strings wrapped around messages by printgreen() and
	# printyellow().  All three are empty, so output is printed without colour.
	green = yellow = reset = ""

	# if os.name != "nt":
	#     from sty import fg
	#     green = fg(0, 255, 0)
	#     yellow = fg(255, 255, 0)
	#     reset = fg.rs
	# else:
	#     pass

	# Every command-line argument after the script name is treated as a page URL.
	inputurls = argv[1:]

	# idea: take file list of urls?

	# User-Agent string sent with every request made through ``headers``.
	myUserAgent = (
	    "Mozilla/5.0 (Linux; Android 7.1.2; SM-G610M) "
	    "AppleWebKit/537.36 (KHTML, like Gecko) "
	    "Chrome/71.0.3578.99 Mobile Safari/537.36"
	)
	# Start from the default headers of requests and override the User-Agent.
	headers = requests.utils.default_headers()
	headers["User-Agent"] = myUserAgent


	# Captures the JSON object that the page assigns to
	# ``window._PBS_KIDS_DEEPLINK``; DOTALL lets the object span several lines.
	_DEEPLINK_RE = re.compile("window._PBS_KIDS_DEEPLINK = ({.*?});", re.DOTALL)

	# Character clean-up applied to a video title before it is used as a file
	# name.  None of the replacement texts contains a character that is itself
	# replaced, so one translate() pass equals replacing them one after another.
	_TITLE_TRANSLATION = str.maketrans(
	    {"/": ", ", "&": "and", ":": "--", "?": None, "!": None, '"': None}
	)


	def _video_object(page):
	    """Fetch *page* and return the ``video_obj`` mapping embedded in it.

	    Args:
	        page: URL of the web page to fetch.

	    Returns:
	        The ``video_obj`` entry of the JSON block found in the page.

	    Raises:
	        requests.HTTPError: if the server answers with an error status.
	        ValueError: if the page has no ``window._PBS_KIDS_DEEPLINK`` block.
	    """
	    r = requests.get(page, headers=headers)
	    r.raise_for_status()
	    matches = _DEEPLINK_RE.search(r.text)
	    if matches is None:
	        # search() returns None when the pattern is absent; fail with a
	        # message that names the page instead of an AttributeError.
	        raise ValueError("No PBS Kids video data found at {}".format(page))
	    return json.loads(matches.group(1))["video_obj"]


	def captionblock(page):
	    """Return the ``closed_captions`` entry of the video embedded in *page*.

	    Args:
	        page: URL of the web page to fetch.

	    Raises:
	        requests.HTTPError: if the server answers with an error status.
	        ValueError: if the page has no ``window._PBS_KIDS_DEEPLINK`` block.
	    """
	    return _video_object(page)["closed_captions"]


	def metadata(page):
	    """Return the caption URL, video URL and cleaned title for *page*.

	    Args:
	        page: URL of the web page to fetch.

	    Returns:
	        A tuple ``(captionurl, vidurl, title)``: the ``URI`` of the first
	        caption entry whose ``format`` is ``"SRT"``, the ``mp4`` value of
	        the video object, and the video title with ``/ & : ? ! "`` replaced
	        or removed.

	    Raises:
	        requests.HTTPError: if the server answers with an error status.
	        ValueError: if the page has no ``window._PBS_KIDS_DEEPLINK`` block.
	        LookupError: if none of the caption entries is in SRT format.
	    """
	    video = _video_object(page)

	    captionurl = None
	    for entry in video["closed_captions"]:
	        if entry["format"] == "SRT":
	            captionurl = entry["URI"]
	            break
	    if captionurl is None:
	        # Without this check the return below would fail with an
	        # UnboundLocalError whenever no SRT entry exists.
	        raise LookupError("No SRT captions found for {}".format(page))

	    vidurl = video["mp4"]
	    title = video["title"].translate(_TITLE_TRANSLATION)
	    return (captionurl, vidurl, title)


	def printgreen(message):
	    """Print *message* wrapped in the ``green`` and ``reset`` strings."""
	    print(green + message + reset)


	def printyellow(message):
	    """Print *message* wrapped in the ``yellow`` and ``reset`` strings."""
	    print(yellow + message + reset)


	def caption(input_url, output_file):
	    """Download the caption file at *input_url* and save it as *output_file*.

	    Nothing is downloaded when *output_file* already exists.

	    Args:
	        input_url: URL of the caption file.
	        output_file: path the captions are written to.

	    Raises:
	        requests.HTTPError: if the server answers with an error status.
	    """
	    if path.exists(output_file):
	        print("")
	        printgreen("Already fetched captions.")
	        print("")
	        return

	    # Download under a temporary name and rename only once everything has
	    # been written; otherwise an interrupted download would leave a truncated
	    # file that the check above treats as already fetched.
	    partial_file = output_file + ".part"
	    # The with-block closes the streamed response even if an error occurs.
	    with requests.get(input_url, headers=headers, stream=True) as captiondl:
	        # Raise an exception for 4xx/5xx answers instead of saving the
	        # server's error page as if it were captions.
	        captiondl.raise_for_status()
	        with open(partial_file, "wb") as captionfile:
	            for block in captiondl.iter_content(1024):
	                captionfile.write(block)
	    os.replace(partial_file, output_file)
	    print("")
	    printgreen("Captions written to file as {}".format(output_file))
	    print("")


	def videodl(input_url, output_file):
	    """Download the video at *input_url* to *output_file* using YoutubeDL.

	    After the download, the video's info is extracted a second time
	    (without downloading) and its ``upload_date`` value is printed.

	    Args:
	        input_url: URL of the video.
	        output_file: value passed to YoutubeDL as its ``outtmpl`` option.
	    """
	    ydl_opts = {"outtmpl": output_file, "quiet": False, "no_warnings": True}
	    ydl = YoutubeDL(ydl_opts)
	    ydl.add_default_info_extractors()
	    ydl.download([input_url])  # ydl.download takes a list as input
	    # TODO: the built-in download output is ugly; a prettier progress bar
	    # would be nicer.
	    meta = ydl.extract_info(input_url, download=False)
	    print(meta["upload_date"])
	    # uglyfilename = meta["webpage_url_basename"]
	    # os.rename(uglyfilename, output_file)
	    printgreen("Downloaded {}.".format(output_file))


	# def subs_convert(inputsubs,outputsubs):
	# input_s = ffmpeg.input(inputsubs)
	# out = ffmpeg.output(input_s,outputsubs,scodec="webvtt")
	# out.run()


	# def encode_video(input_vid, input_subs, outfile):
	# vinput = ffmpeg.input(input_vid)
	# audio = vinput.audio.filter("acopy")
	# subs = ffmpeg.input(input_subs)
	# r = probe(input_vid)["streams"][0]
	# if r["height"] > 720:
	# w = int((720 * r["width"]) / r["height"])
	# if w % 2 != 0:
	# w = w + 1
	# video = vinput.video.filter("scale", w, 720)
	# else:
	# video = vinput.video
	# out = ffmpeg.output(
	# audio,
	# video,
	# subs,
	# outfile,
	# acodec="aac",
	# vcodec="libx264",
	# scodec="srt", # webvtt for vtt
	# disposition="default",
	# )
	# printyellow("Reencoding. This will probably take a while.")
	# out.run(quiet=True) # Eventually: make progress bar of output


	if __name__ == "__main__":

	    # Fetch the metadata of every URL before any download starts.  Each page
	    # is requested exactly once; the result tuple is turned into a dict.
	    keys = ("captionurl", "vidurl", "title")
	    episode = [dict(zip(keys, metadata(url))) for url in inputurls]

	    # number = len(episode)

	    # if number == 1:
	    #     print("Fetching 1 episode.")
	    # else:
	    #     print("Fetching {} episodes.".format(number))

	    # Download captions and video for each episode, named after its title.
	    for ep in episode:
	        title = ep["title"]
	        caps = ep["captionurl"]
	        subtitle_file = "{}.srt".format(title)
	        print("")
	        printyellow("Fetching captions for {}".format(title))
	        printgreen("from url: {}.".format(caps))
	        caption(caps, subtitle_file)
	        printyellow("Fetching mp4 file for {}".format(title))
	        print("")
	        mp4file = "{}.mp4".format(title)
	        videodl(ep["vidurl"], mp4file)
	        # print("")
	        # printyellow("Reencoding video. Be patient.")
	        # print("")
	        # finalfile = "{}.mkv".format(title)
	        # encode_video(mp4file, subtitle_file, finalfile)
	        # printgreen("Reencoded {} as {}.".format(mp4file, finalfile))
	        # print("")
	        # os.remove(mp4file)
	        # os.remove(subtitle_file)