Last active
May 22, 2024 00:56
-
-
Save amyreese/1749bd9f700e5ffe53b631f5e0d1dfd1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright Amethyst Reese | |
# Licensed under the MIT License | |
import json | |
import re | |
import shutil | |
import sys | |
from dataclasses import asdict, dataclass, field | |
from datetime import date, datetime | |
from pathlib import Path | |
from pprint import pprint | |
""" | |
Fetch conference videos from a YT playlist and generate pyvideo metadata files. | |
$ yt-dlp --skip-download --write-info-json <PLAYLIST-URL> | |
$ python3.12 pyvideo-converter.py <path/to/yt-dlp/output/> [<path/to/pyvideo/data/>] | |
""" | |
# Number of leading lines sliced off every video description in main().
DROP_FIRST_LINES = 4
# Number of trailing lines sliced off every video description in main().
DROP_LAST_LINES = 8
# One or more consecutive non-word characters; Sluggable.slug replaces each
# such run with a single "-".
SLUG_RE = re.compile(r"(\W+)")
# Matches a title of the form `"<talk title>" - <speakers> ` (note the required
# trailing space) and captures the quoted title and the speaker text.
QUOTED_TITLE_RE = re.compile(r'"(.+)" - ([\w\s"\',-]+) ')
# Matches a line beginning with digits, a colon, two digits and " - "
# (e.g. "12:34 - "); main() drops description lines that match.
TIMESTAMP_RE = re.compile(r"^\d+:\d\d - ")
# Resolved current working directory, captured when the module is loaded.
BASE = Path.cwd().resolve()
def clamp[T](value: T, lower: T, upper: T) -> T: | |
return min(max(value, lower), upper) | |
def parse_date( | |
value: str, format: str = r"%Y-%m-%d", default: date = date.today() | |
) -> date: | |
if not value: | |
return default | |
return datetime.strptime(value, format).date() | |
class Sluggable:
    """Mixin that derives a hyphenated slug from a ``title`` attribute."""

    title: str

    @property
    def slug(self):
        """Return the lower-cased title as at most ten hyphen-joined segments.

        Every run of non-word characters becomes a single hyphen, and leading
        and trailing hyphens are removed before the segments are counted.
        """
        hyphenated = SLUG_RE.sub("-", self.title.lower())
        segments = hyphenated.strip("-").split("-")
        return "-".join(segments[:10])
@dataclass
class Conference(Sluggable):
    """Hard-coded description of the conference whose videos are converted."""

    # Used as the category title and, through ``slug``, as the output directory name.
    title: str = "North Bay Python 2023"
    playlist_url: str = "https://www.youtube.com/playlist?list=PLaeNpBNgqQWtmPtFhkhBzLlMVjwr9mivC"
    # Bounds that main() clamps each talk's recorded date into.
    start: date = date(2023, 7, 29)
    end: date = date(2023, 7, 30)
@dataclass
class Talk(Sluggable):
    """Metadata for one talk video, dumped to JSON via ``dataclasses.asdict``."""

    title: str = ""
    description: str = ""
    speakers: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    language: str = "eng"
    # main() stores the date as a string so the record is JSON-serializable.
    # The fallback is computed per instance; a plain ``date.today()`` default
    # would be evaluated once, when the class body runs.
    recorded: date | str = field(default_factory=date.today)
    # Copied from the "duration" field of the yt-dlp info file by main().
    duration: int = 0
    copyright_text: str = ""
    related_urls: list[dict[str, str]] = field(default_factory=list)
    # main() fills this with {"type": "youtube", "url": ...} entries.
    videos: list[dict[str, str]] = field(default_factory=list)
    thumbnail_url: str = ""
def _talk_from_info(raw: dict, conf: "Conference") -> "Talk":
    """Build a Talk from one parsed yt-dlp video info dict."""
    talk = Talk(
        title=raw["title"],
        # Drop the fixed-size header/footer of the description, then any
        # remaining lines that start with a timestamp.
        description="\n".join(
            line
            for line in raw["description"].splitlines()[
                DROP_FIRST_LINES:-DROP_LAST_LINES
            ]
            if not TIMESTAMP_RE.match(line)
        ),
        # The upload date is forced into the conference's date range and
        # stored as a string so it can be serialized to JSON.
        recorded=str(
            clamp(
                parse_date(raw["upload_date"], r"%Y%m%d", conf.start),
                conf.start,
                conf.end,
            )
        ),
        duration=raw["duration"],
        videos=[{"type": "youtube", "url": raw["webpage_url"]}],
        thumbnail_url=raw["thumbnail"],
    )
    # Titles shaped like `"Title" - Speaker A, Speaker B ...` are split into
    # the bare title and a list of speaker names.
    if match := QUOTED_TITLE_RE.search(talk.title):
        title, speaker = match.groups()
        talk.title = title
        talk.speakers.extend(s.strip() for s in speaker.split(","))
    return talk


def _write_json(path: Path, payload: dict) -> None:
    """Write ``payload`` to ``path`` as indented, key-sorted JSON plus a newline."""
    path.write_text(
        json.dumps(payload, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )


def main(yt_info_dir: Path, pyvideo_data_dir: Path) -> None:
    """Convert yt-dlp info files into a pyvideo category directory.

    Every ``*.json`` file directly inside ``yt_info_dir`` whose ``_type`` is
    ``"video"`` becomes one talk. The directory
    ``pyvideo_data_dir/<conference slug>`` is deleted and recreated, then
    filled with ``category.json`` and one ``videos/<talk slug>.json`` per talk.

    Args:
        yt_info_dir: Directory holding the ``*.json`` info files.
        pyvideo_data_dir: Directory under which the conference directory is
            (re)created.

    Raises:
        KeyError: If a video info file lacks one of the fields that are read.
    """
    conf = Conference()
    talks: list[Talk] = []
    for path in sorted(yt_info_dir.glob("*.json")):
        print(f"loading {path}")
        # JSON interchange text is UTF-8; do not depend on the locale's
        # default encoding, which differs between platforms.
        raw = json.loads(path.read_text(encoding="utf-8"))
        # .get() so a JSON object without "_type" is skipped, not a KeyError.
        kind = raw.get("_type")
        if kind != "video":
            print(f"skipping {kind}")
            continue
        talks.append(_talk_from_info(raw, conf))

    conf_dir = pyvideo_data_dir / conf.slug
    # Start from a clean slate; a missing directory is not an error.
    shutil.rmtree(conf_dir, ignore_errors=True)
    conf_dir.mkdir(parents=True, exist_ok=True)
    _write_json(conf_dir / "category.json", {"title": conf.title})

    video_dir = conf_dir / "videos"
    video_dir.mkdir(parents=True, exist_ok=True)
    for talk in talks:
        path = (video_dir / talk.slug).with_suffix(".json")
        print(f"writing {path}")
        record = asdict(talk)
        pprint(record)
        _write_json(path, record)
if __name__ == "__main__":
    # Script entry point: argv[1] is the yt-dlp output directory and the
    # optional argv[2] is the pyvideo data directory (defaults to the cwd).
    print(f"{sys.argv=!r}")
    if len(sys.argv) < 2:
        # Without this guard the unpacking below fails with an opaque
        # "not enough values to unpack" ValueError.
        sys.exit(
            f"usage: {sys.argv[0]} <path/to/yt-dlp/output/> [<path/to/pyvideo/data/>]"
        )
    _, info_arg, *rest = sys.argv
    yt_info_dir = Path(info_arg).resolve()
    data_dir = Path(rest[0] if rest else Path.cwd()).resolve()
    main(yt_info_dir, data_dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment