-
-
Save amcgregor/666fa2b3ce673ab15ebcf2420353374b to your computer and use it in GitHub Desktop.
What I learned from gaining personal control over my viewing data, implementing my own ghetto version of podsync.net.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# File at root level included by each channel's specific Makefile.

# NOTE(review): .latest and index.rss are files on disk but are listed as phony, so their recipes
# run on every invocation; the index.rss recipe performs its own freshness check. Presumably
# deliberate -- confirm before removing them from this list.
.PHONY: all ingest pull render help .latest index.rss
# Base configuration: output file naming scheme.
options := -o '%(upload_date)s--%(id)s--%(title)s--%(resolution)s.%(ext)s'

# Global transfer options.
options += \
    --progress \
    --concurrent-fragments 3 \
    --continue \
    --ignore-errors \
    --restrict-filenames \
    --no-overwrites \
    --no-playlist \
    --xattr-set-filesize \
    --write-info-json

# Rate limiting (disabled; uncomment to enable).
# options += --max-downloads 20
# options += --limit-rate 1M
# options += --sleep-interval 15 --max-sleep-interval 45

# Download, tracking & privacy. The temp path matches the directory the recipes create.
options += \
    --paths temp:/tmp/cast \
    --download-archive /Volumes/Podcasts/·\ Workspace/_archive.ids \
    --no-call-home \
    --no-mark-watched

# Content: preferred format combinations, first available wins.
options += --format '137+258/137+140/700+140/699+140/299+140/bestvideo[ext=mp4]+bestaudio[ext=m4a]'

# Post-processing.
options += \
    --merge-output-format mp4 \
    --embed-subs \
    --embed-metadata \
    --embed-thumbnail \
    --sub-format best \
    --sub-langs all,-live_chat

# Common filters.
options += --reject-title "(Elden Ring|Stranger Things|Sims 4|Persona)"
# options += --dateafter today-90day
# options += --dateafter 20210101
# options += --match-filter 'duration < 3600'
# options += --reject-title ""

# Extra options applied only by the `ingest` target; channel Makefiles may override this.
ingestopts := --dateafter 20210101
# Phony productions for ease of end-user use.
# Default goal (first rule in the file): requests the .latest stamp and the rendered index.rss.
# The two prerequisites have no dependency on each other, so their order is only guaranteed
# when make runs serially.
all: .latest index.rss ## Pull the latest changes and render the feed.
# Self-documenting help: lists every `target: ## description` line found in the parsed makefiles,
# sorted. printf is used for the banner because echo's handling of backslash escapes varies
# between shells.
help: ## Show this help message and exit.
	@printf 'Usage: make <command>\n\033[36m\033[0m\n'
	@awk 'BEGIN {FS = ":.*##"} /^[a-zA-Z_-]+:.*?##/ { printf "\033[36m%-18s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) | sort
# Initial population of a channel directory: runs yt-dlp with the shared options plus ingestopts,
# then refreshes the .latest stamp.
#  - Fails early when `uri` is unset, rather than running yt-dlp without a URL and still stamping.
#  - /tmp/cast is the temp directory named by `--paths temp:/tmp/cast` in the shared options.
#  - The `-` prefix ignores yt-dlp's exit status, so a partial download still updates the stamp.
ingest: ## Perform initial channel content ingest.
	$(if $(strip $(uri)),,$(error uri is not set; define it in the channel Makefile))
	@mkdir -p /tmp/cast
	-yt-dlp $(options) $(ingestopts) $(uri)
	@touch .latest
# Periodic poll: runs yt-dlp with the shared options, limited to the first 20 playlist entries,
# then refreshes the .latest stamp.
#  - Fails early when `uri` is unset, rather than running yt-dlp without a URL and still stamping.
#  - /tmp/cast is the temp directory named by `--paths temp:/tmp/cast` in the shared options.
#  - The `-` prefix ignores yt-dlp's exit status, so a partial download still updates the stamp.
pull: ## Perform a periodic poll for newer content.
	$(if $(strip $(uri)),,$(error uri is not set; define it in the channel Makefile))
	@mkdir -p /tmp/cast
	-yt-dlp $(options) --playlist-end 20 $(uri)
	@touch .latest
render: index.rss ## Render the RSS feed if needed.

# Actual productions.

# Requesting the .latest stamp triggers a pull, which touches it.
.latest: pull

# Possibility: use calculated (known) files, and current directory to catch additions?
# Renders when index.rss is missing or any *.json file is newer than it. Written as if/else so
# that a failing render.py fails the rule; the former `A && render.py . || echo` form printed
# "Up-to-date." and exited 0 when the render itself failed.
index.rss: .
	@if test ! -e $@ || find . -iname '*.json' -newer $@ -print -quit | grep -q .; then \
		render.py .; \
	else \
		echo "Up-to-date."; \
	fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Top-level Makefile, in directory containing subdirectories per channel or playlist.

.PHONY: pull render active missing help

## Basic operations.

# Adjust the -j value to the number of CPU cores you have, and/or what your internet connection is capable of handling.
# The list of active subdirectories, to poll and update, must be stored one per line in a file named .active
# Each listed directory gets its default goal built by a sub-make. $(MAKE) is used instead of a
# literal `make` so the same make binary and flags (e.g. -n) reach the sub-makes.
# `|| true` keeps one failing channel from failing the whole pull.
pull: .active ## Pull the latest episodes of all active podcasts.
	@time cat .active | parallel -j 8 --progress --eta --shuf --line-buffer '$(MAKE) -C {}' || true
	@touch .latest
# Runs the `render` goal in every directory listed in .active, discarding the sub-make output.
# $(MAKE) is used instead of a literal `make` so the same make binary and flags reach the sub-makes.
render: .active ## Render all active podcasts, as needed.
	@time cat .active | parallel -j 8 --progress --bar --eta --shuf '$(MAKE) -C {} render > /dev/null'
#render-all: # Render all active RSS feeds. | |
# @for i in $(cat .active); do (echo -e "\n$$i"; cd "$$i"; make render); done | |
## Podcast metadata. | |
active: .active ## Discover active podcasts.

# Writes (and echoes) the name of every subdirectory that contains a Makefile, one per line.
# Shell variables need `$$` in a recipe: the previous `$(dirname $i)` was expanded by make itself
# to an empty string, producing a file of blank lines.
# The rule has no prerequisites, so an existing .active is never regenerated; delete it to rediscover.
.active:
	@for i in */Makefile; do dirname "$$i"; done | tee $@
missing: .missing ## Discover podcasts requiring configuration.

# Writes (and echoes) the name of every immediate subdirectory that has no Makefile, skipping names
# starting with `.` or `·`. `-mindepth 1 -maxdepth 1` replaces the BSD-only `-depth 1` so the
# recipe also works with GNU find.
# The rule has no prerequisites, so an existing .missing is never regenerated; delete it to rescan.
.missing:
	@find . -mindepth 1 -maxdepth 1 -type d -not -name .\* -not -name ·\* '!' -exec test -e "{}/Makefile" ';' -exec basename {} \; | tee $@
## | |
# Self-documenting help: lists every `target: ## description` line found in the parsed makefiles.
# printf is used for the banner because echo's handling of backslash escapes varies between shells.
help: ## Show this help message and exit.
	@printf 'Usage: \033[1mmake\033[0m \033[4mcommand\033[0m\n\033[36m\033[0m\n'
	@awk 'BEGIN {FS = ":.*##"} /^[a-zA-Z_-]+:.*?##/ { printf "\033[36m%-10s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
	@echo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A Makefile placed within each channel or playlist directory.
# May extend or override the variables used by the actual command invocations.

# MINIMUM
include ../global.mk
# The quotes are part of the value, so the URL reaches the shell quoted.
uri := "https://www.youtube.com/..."

# ADDITIONAL
# Comments sit on their own lines: text before an inline `#` (including the spaces) would
# otherwise become part of the variable's value.

# E.g. restrict to video shorter than one hour.
options += --match-filter "duration < 3600"

# E.g. when performing initial ingest, limit to the most recent 90 days.
ingestopts := --dateafter today-90day
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# This is the old approach. | |
# For the current one, please reference the above Makefiles. | |
# Automate the execution of youtube-dl with appropriate options for archival and data necessary to construct a | |
# podcast feed, with variations on exact criteria and limits, and moderate parallelization. | |
# | |
# Usage: | |
# ./pull.sh | |
# | |
# That's it. ;^P Okay, well, you're also going to want to customize this file to choose your own channels and | |
# playlists to pull in; the pattern in the indented block should be readily apparent. Then you're going to want to | |
# serve the content from the location you run this script in, for which I recommend Nginx. | |
# (Don't forget to set up SSL. ;) | |
# | |
# brew install nginx-full --with-accept-language-module --with-brotli-module --with-gzip-static --with-http2 --with-nchan-module | |
# | |
# Example configuration: (generally goes in /etc/nginx/servers/<yourserver>.conf) | |
# | |
# server { | |
# listen 80; | |
# server_name cast.example.com; | |
# | |
# location / { | |
# index index.rss; | |
# root /path/to/your/download/location; | |
# } | |
# } | |
# | |
# NOTE: patch applied ("break" added) to youtube.py line 291 or so, to prevent pulling multiple pages of videos.
# NOTE: patch applied ("sys.exit(0)" added) upon encountering the first video rejected due to date-before criteria.
# The latter because the playlists come in reverse-chronological order, newest to oldest. Hit one that's too old, and
# every subsequent one will be older than that and fail, too. No need to waste HTTP requests pulling details in.
# | |
# If you've followed the instructions in render.py, you can find the path to the file to patch by running: | |
# python3 -me youtube_dl.extractor.youtube | |
# Options common to every download. The embedded single quotes are kept literally: the option
# strings are later pasted into command lines that a shell re-parses.
BASEOPTS="--no-call-home --ignore-errors --continue --no-overwrites"
BASEOPTS+=" --download-archive _archive.ids --cache-dir Cache --no-mark-watched --yes-playlist"
BASEOPTS+=" --restrict-filenames --write-info-json --write-thumbnail --embed-thumbnail"
BASEOPTS+=" --write-sub --embed-subs --merge-output-format mp4 --sub-format best --add-metadata"
BASEOPTS+=" --youtube-skip-dash-manifest --xattr-set-filesize"
BASEOPTS+=" -o '%(playlist)s/%(upload_date)s--%(id)s--%(title)s--%(resolution)s.%(ext)s'"

# BASEOPTS+=" --proxy \"socks5://127.0.0.1:4088/\""  # If you end up getting rate limited.
# BASEOPTS+=" --limit-rate 2M"
# BASEOPTS+=" --sleep-interval 5 --max-sleep-interval 90"

# Generally we request 1080p H.264 M4V + M4A audio.
# Certain channels don't do 1080p, so we fall back on the best available.
BASEOPTS+=" --format '137+140/bestvideo[ext=mp4]+bestaudio[ext=m4a]'"

# Variations.

# Unrestricted: snapshot taken before any date or count limits are appended.
UNLIMITED="${BASEOPTS}"

# In general, I only care about videos uploaded within the last quarter.
BASEOPTS+=" --dateafter today-3month"

# We also don't care to download too many videos from one channel at a time.
# Some playlists/channels are added for the purpose of bulk archival, though.
BULK="${BASEOPTS} --max-downloads 50"
BASEOPTS+=" --max-downloads 25"
# Each heredoc line is handed to GNU parallel as one shell command (three at a time, shuffled).
# The heredoc delimiter is unquoted, so the ${...} option strings are expanded here, before
# parallel sees the lines; lines starting with `#` reach the job shell as comments.
# The playlist URL is single-quoted because it contains `?`, which the job shell would otherwise
# treat as a glob character (zsh aborts the command when such a glob matches nothing).
parallel --line-buffer -j 3 --shuf <<VIDEOS
# Newsy Type Things and Stuff
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UClFSU9_bUb4Rc6OYfTt5SPw
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCy6kyFxaMqGtpE3pQTflK8A
# ...
# Educational
youtube-dl ${UNLIMITED} https://www.youtube.com/channel/UCOGeU-1Fig3rrDjhm9Zs_wg
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCo8bcnLyZH8tBIH9V1mLgqQ
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCxzC4EngIsMrPmbm6Nxvb-A
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UC7_gcs09iThXybpVgjHZ_7g
youtube-dl ${UNLIMITED} https://www.youtube.com/channel/UC9-y-6csu5WGm29I7JiwpnA
youtube-dl ${UNLIMITED} https://www.youtube.com/channel/UCoxcjq-8xIDTYp3uz647V5A
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCUHW94eEFW7hkUMVaZz4eDg
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q
# ...
# Gaming
youtube-dl ${BASEOPTS} --reject-title "(off topic|between the games|ready set show|let's roll|AHWU)" https://www.youtube.com/channel/UCsB0LwkHPWyjfZ-JwvtwEXw
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCRHXUZ0BxbkU2MYZgsuFgkQ
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCMdGPato0IC5-zZjJIf-P9w
# ...
# Entertainment
youtube-dl ${UNLIMITED} 'https://www.youtube.com/playlist?list=PLAbMhAYRuCUhC85vZRvBBdYPJk-9pLN-8'
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCo_IB5145EVNcf8hw1Kku7w
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UC3sznuotAs2ohg_U__Jzj_Q
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCqTYHSnBUXZamsVcOlQf-fg
youtube-dl ${BULK} https://www.youtube.com/channel/UCeE3lj6pLX_gCd0Yvns517Q
youtube-dl ${UNLIMITED} https://www.youtube.com/channel/UCuoMasRkMhlj1VNVAOJdw5w
youtube-dl ${UNLIMITED} https://www.youtube.com/channel/UCZOnoLKzoBItcEk5OsES2TA
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCYUQQgogVeQY8cMQamhHJcg
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCZFipeZtQM5CKUjx6grh54g # Spaaaaace...
# Music
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCshiNtfJ7Dj3nlh41a6M-kg
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UCgFvT6pUq9HLOvKBYERzXSQ
youtube-dl ${BASEOPTS} https://www.youtube.com/channel/UC0bVrLK0otdvTm_KWKWDgtQ
youtube-dl ${UNLIMITED} https://www.youtube.com/channel/UCMu5gPmKp5av0QCAajKTMhw
VIDEOS
# Generate RSS feeds from the videos on-disk: run render.py once per immediate subdirectory.
# `-mindepth 1 -maxdepth 1` selects the same entries as the BSD-only `-depth 1`, and is also
# accepted by GNU find.
find . -mindepth 1 -maxdepth 1 -type d | parallel --bar --progress "./render.py {} > /dev/null"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Transform a directory of youtube-dl downloads into a Podcast. | |
At a fundamental level, all of the data is sitting there in '.info.json' files, it just needs to be collected | |
together and formatted as an appropriately populated RSS feed. | |
Requires Python 3.7. Recommend installation within a virtual environment (python3 -m venv). | |
Requires the cinje template engine, the uri package (imported below for URI objects) and, of course, youtube-dl;
the e package is optional but very handy. Run:
pip3 install youtube-dl cinje uri e
Make sure you also mark these files as executable: | |
chmod +x pull.sh render.py | |
The reason for the use of str() repeatedly in the template is that bare use of URL objects will generate an actual | |
<a href=""> link. We naively assume the first video in the set is demonstrative of the whole, i.e. one directory, | |
one creator. | |
""" | |
from datetime import datetime | |
from sys import argv, exit | |
from pathlib import Path | |
from uri import URI | |
from json import loads | |
from math import floor | |
from dataclasses import dataclass | |
from typing import Any | |
from cinje import flatten | |
from tmpl import rss | |
BASE = URI("https://cast.example.com") | |
def dur(point: int) -> str:
    """Generate a textual duration (minutes:seconds) from a number of seconds.

    Minutes are not folded into hours, so 3600 renders as "60:00". A fractional value is
    truncated to whole seconds first; previously a float reached the ``:02d`` format and raised
    ValueError.
    """
    minutes, seconds = divmod(int(point), 60)  # Integer arithmetic; no float division.
    return f"{minutes:d}:{seconds:02d}"
@dataclass
class Episode:
    """Metadata representation of one downloaded video, built from its '.info.json' file."""

    identifier: str  # The 'id' field of the info JSON.
    channel: str  # The 'uploader' field.
    channel_url: URI
    title: str
    description: str
    duration: str  # Pre-formatted "minutes:seconds" text, see dur().
    size: int  # Size in bytes of the file named by '_filename'.
    uploaded: datetime  # Parsed from the 'upload_date' field (YYYYMMDD).
    link: URI  # The 'webpage_url' field.
    video: URI  # base joined with '_filename'.
    thumbnail: URI  # Same as video, with a trailing '.mp4' swapped for '.jpg'.
    data: dict  # The complete parsed info JSON, for fields not promoted above.

    def __repr__(self):
        return f'Episode({self.identifier}, "{self.title}", from="{self.channel}")'

    @classmethod
    def from_json(cls, path: Path, base: URI = BASE):
        """Construct an Episode from the info JSON file at `path`.

        `base` is the URI that the recorded '_filename' is joined onto to form the public video
        and thumbnail addresses. The file named by '_filename' is stat()ed as recorded, so a
        relative name is resolved against the current working directory.

        Raises KeyError if an expected field is absent, and OSError if the JSON or the video
        file cannot be read.
        """
        info = loads(path.read_text(encoding='utf-8'))
        filename = info['_filename']

        # Swap only a trailing '.mp4'; str.replace() would also rewrite any earlier '.mp4'
        # occurring inside the name. Other extensions are left untouched, as before.
        poster = filename[:-len('.mp4')] + '.jpg' if filename.endswith('.mp4') else filename

        return cls(
            identifier=info['id'],
            channel=info['uploader'],
            channel_url=URI(info['channel_url']),
            title=info['title'],
            description=info['description'],
            duration=dur(info['duration']),
            size=Path(filename).stat().st_size,
            uploaded=datetime.strptime(info['upload_date'], "%Y%m%d"),
            link=URI(info['webpage_url']),
            video=base / filename,
            thumbnail=base / poster,
            data=info,
        )
def discover(path: Path, base: URI = BASE):
    """Yield an Episode for every '*.info.json' in `path` that has a matching '*.mp4' beside it.

    Only matched pairs are considered. Episodes are produced in sorted path order, and each one
    is announced on stdout as it is collected.
    """
    expected = {mp4.with_suffix('.info.json') for mp4 in path.glob('*.mp4')}
    paired = [meta for meta in path.glob('*.info.json') if meta in expected]
    paired.sort()

    for meta in paired:
        episode = Episode.from_json(meta, base)
        print("Collected:", episode)
        yield episode
def process(path: Path):
    """Render `path`/index.rss from the episodes discovered in the directory `path`.

    Nothing is written when the directory holds no matched episodes. The first episode is taken
    as representative of the whole feed for channel-level metadata. Directories whose name
    contains 'Uploads_from_' are titled after the channel; any other directory is treated as a
    playlist and titled after it.
    """
    print("Processing:", path)

    now: datetime = datetime.utcnow()
    episodes = list(discover(path))

    if not episodes:
        print("Skipping:", path)
        return

    playlist: bool = 'Uploads_from_' not in str(path)
    channel = episodes[0]
    recency: datetime = max(i.uploaded for i in episodes)

    # Not every info file carries a playlist title; fall back to the channel name rather than
    # aborting the render with a KeyError.
    playlist_title = channel.data.get('playlist_title') or channel.channel
    title = playlist_title if playlist else channel.channel

    target = path / 'index.rss'

    with target.open('w', encoding='utf-8') as writer:
        print(f"Rendering: {target!s}")
        feed = rss(episodes, now, recency, title, channel.channel_url, playlist_title)
        flatten(feed, writer)

    print("Done.")
def main(*paths) -> int:
    """Render a feed for each given directory, in order; always returns 0."""
    for directory in map(Path, paths):
        process(directory)

    return 0
if __name__ == '__main__':
    # Every command-line argument is a directory to render; main() walks them in order.
    main(*argv[1:])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: cinje
: def rss entries, now, recent, title, link, description
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">
<channel>
<title>${title}</title>
<link>${str(link)}</link>
<description>${description}</description>
<itunes:author>${entries[0].channel}</itunes:author>
<language>en-us</language>
<lastBuildDate>${now.strftime('%a, %d %b %Y %H:%M:%S +0000')}</lastBuildDate>
<pubDate>${recent.strftime('%a, %d %b %Y %H:%M:%S +0000')}</pubDate>
<category>TV &amp; Film</category>
<generator>Marrow Cast</generator>
<ttl>60</ttl>
: for i, item in enumerate(entries)
<item>
<guid isPermaLink="false">${item.identifier}</guid>
<pubDate>#{item.uploaded.strftime('%a, %d %b %Y %H:%M:%S +0000')}</pubDate>
<title><![CDATA[#{item.title}]]></title>
<link><![CDATA[#{str(item.link)}]]></link>
<description><![CDATA[#{item.description}]]></description>
<enclosure url="#{str(item.video)}" length="${item.size}" type="video/mp4"></enclosure>
<itunes:author>${item.channel}</itunes:author>
<itunes:subtitle>${item.title}</itunes:subtitle>
<itunes:summary><![CDATA[#{item.description}]]></itunes:summary>
<itunes:image href="${str(item.thumbnail)}"></itunes:image>
<itunes:duration>#{item.duration}</itunes:duration>
<itunes:explicit>no</itunes:explicit>
<itunes:order>#{i}</itunes:order>
</item>
: end
</channel>
</rss>
: end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://github.com/nficano/pytube