Skip to content

Instantly share code, notes, and snippets.

@amyreese
Last active May 22, 2024 00:56
Show Gist options
  • Save amyreese/1749bd9f700e5ffe53b631f5e0d1dfd1 to your computer and use it in GitHub Desktop.
Save amyreese/1749bd9f700e5ffe53b631f5e0d1dfd1 to your computer and use it in GitHub Desktop.
# Copyright Amethyst Reese
# Licensed under the MIT License
import json
import re
import shutil
import sys
from dataclasses import asdict, dataclass, field
from datetime import date, datetime
from pathlib import Path
from pprint import pprint
"""
Fetch conference videos from a YT playlist and generate pyvideo metadata files.
$ yt-dlp --skip-download --write-info-json <PLAYLIST-URL>
$ python3.12 pyvideo-converter.py <path/to/yt-dlp/output/> [<path/to/pyvideo/data/>]
"""
# Number of leading lines sliced off every video description before it is
# stored (presumably channel boilerplate -- verify against real descriptions).
DROP_FIRST_LINES = 4
# Number of trailing lines sliced off every video description.
DROP_LAST_LINES = 8
# Runs of non-word characters; each run is replaced by a single "-" in slugs.
SLUG_RE = re.compile(r"(\W+)")
# Captures a double-quoted talk title followed by " - " and a speaker list
# (word characters, whitespace, quotes, commas, hyphens) ending in a space.
QUOTED_TITLE_RE = re.compile(r'"(.+)" - ([\w\s"\',-]+) ')
# Matches description lines that start with a "M:SS - " style timestamp.
TIMESTAMP_RE = re.compile(r"^\d+:\d\d - ")
# Resolved working directory, captured at import time; not referenced by the
# rest of the visible code.
BASE = Path.cwd().resolve()
def clamp[T](value: T, lower: T, upper: T) -> T:
return min(max(value, lower), upper)
def parse_date(
value: str, format: str = r"%Y-%m-%d", default: date = date.today()
) -> date:
if not value:
return default
return datetime.strptime(value, format).date()
class Sluggable:
    """Mixin that derives a dash-separated slug from a ``title`` attribute."""

    title: str

    @property
    def slug(self) -> str:
        """Return a lowercase slug built from at most ten pieces of the title.

        Runs of non-word characters become single dashes, leading and
        trailing dashes are removed, and only the first ten dash-separated
        pieces are kept.
        """
        dashed = SLUG_RE.sub("-", self.title.lower())
        pieces = dashed.strip("-").split("-")
        return "-".join(pieces[:10])
@dataclass
class Conference(Sluggable):
    """Static description of the conference whose playlist is converted."""

    # Display title; also the source of the output directory slug.
    title: str = "North Bay Python 2023"
    # YouTube playlist the talk videos come from.
    playlist_url: str = (
        "https://www.youtube.com/playlist?list=PLaeNpBNgqQWtmPtFhkhBzLlMVjwr9mivC"
    )
    # First and last day of the event; used to bound recording dates.
    start: date = date(2023, 7, 29)
    end: date = date(2023, 7, 30)
@dataclass
class Talk(Sluggable):
    """Metadata for a single recorded talk.

    Instances are converted with ``dataclasses.asdict`` and written out as
    JSON, so every field should hold a JSON-friendly value.
    """

    title: str = ""
    description: str = ""
    speakers: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    language: str = "eng"
    # Resolved per instance instead of once at class-definition time, so the
    # default never goes stale in a long-lived process.
    # NOTE(review): main() in this file passes an ISO date *string* here,
    # not a date object -- confirm which type is intended.
    recorded: date = field(default_factory=date.today)
    # Copied from the "duration" key of the video info JSON.
    duration: int = 0
    copyright_text: str = ""
    related_urls: list[dict[str, str]] = field(default_factory=list)
    # Each entry is a mapping with "type" and "url" keys.
    videos: list[dict[str, str]] = field(default_factory=list)
    thumbnail_url: str = ""
def _talk_from_info(raw: dict, conf: "Conference") -> "Talk":
    """Build a Talk from one parsed video info dictionary."""
    # Drop the fixed-size header and footer of the description, then remove
    # any timestamp lines from what remains.
    body_lines = raw["description"].splitlines()[DROP_FIRST_LINES:-DROP_LAST_LINES]
    talk = Talk(
        title=raw["title"],
        description="\n".join(
            line for line in body_lines if not TIMESTAMP_RE.match(line)
        ),
        # The upload date is only a proxy for the recording date, so it is
        # forced into the conference's date range and stored as ISO text.
        recorded=str(
            clamp(
                parse_date(raw["upload_date"], r"%Y%m%d", conf.start),
                conf.start,
                conf.end,
            )
        ),
        duration=raw["duration"],
        videos=[{"type": "youtube", "url": raw["webpage_url"]}],
        thumbnail_url=raw["thumbnail"],
    )
    # Titles shaped like '"<talk title>" - <speakers> ...' are split into the
    # bare talk title and a comma-separated speaker list.
    if match := QUOTED_TITLE_RE.search(talk.title):
        title, speaker = match.groups()
        talk.title = title
        talk.speakers.extend(s.strip() for s in speaker.split(","))
    return talk


def _write_json(path: Path, payload: dict) -> None:
    """Write ``payload`` as sorted, indented JSON followed by a newline."""
    text = json.dumps(payload, indent=2, sort_keys=True) + "\n"
    path.write_text(text, encoding="utf-8")


def main(yt_info_dir: Path, pyvideo_data_dir: Path) -> None:
    """Convert video info JSON files into a pyvideo category directory.

    Args:
        yt_info_dir: Directory holding the ``*.json`` info files to read.
        pyvideo_data_dir: Directory in which the conference directory
            (named after the conference slug) is created.

    Any existing conference directory is removed and rebuilt from scratch.
    """
    conf = Conference()
    talks: list[Talk] = []
    for path in sorted(yt_info_dir.glob("*.json")):
        print(f"loading {path}")
        # JSON is read as UTF-8 explicitly instead of relying on the
        # platform's locale encoding.
        raw = json.loads(path.read_text(encoding="utf-8"))
        if raw["_type"] != "video":
            print(f"skipping {raw['_type']}")
            continue
        talks.append(_talk_from_info(raw, conf))

    # Start from a clean directory so files from earlier runs do not linger.
    conf_dir = pyvideo_data_dir / conf.slug
    shutil.rmtree(conf_dir, ignore_errors=True)
    conf_dir.mkdir(parents=True, exist_ok=True)
    _write_json(conf_dir / "category.json", {"title": conf.title})

    video_dir = conf_dir / "videos"
    video_dir.mkdir(parents=True, exist_ok=True)
    for talk in talks:
        path = (video_dir / talk.slug).with_suffix(".json")
        print(f"writing {path}")
        record = asdict(talk)
        pprint(record)
        _write_json(path, record)
if __name__ == "__main__":
    print(f"{sys.argv=!r}")
    # The info directory is mandatory; without it, exit with a usage message
    # instead of an unpacking ValueError traceback.
    if len(sys.argv) < 2:
        sys.exit(
            f"usage: {sys.argv[0]} <path/to/yt-dlp/output/> [<path/to/pyvideo/data/>]"
        )
    _, info_arg, *data_args = sys.argv
    yt_info_dir = Path(info_arg).resolve()
    # The output directory is optional and defaults to the working directory.
    data_dir = Path(data_args[0] if data_args else Path.cwd()).resolve()
    main(yt_info_dir, data_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment