Skip to content

Instantly share code, notes, and snippets.

@daskol
Created June 18, 2023 11:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daskol/4d7784070dea32043fce557a194066bb to your computer and use it in GitHub Desktop.
Save daskol/4d7784070dea32043fce557a194066bb to your computer and use it in GitHub Desktop.
Fetches and parses auto-generated subtitles from a YouTube video
#!/usr/bin/env python
from argparse import ArgumentParser, Namespace
from datetime import datetime
from json import dump
from os import rename
from pathlib import Path
from urllib.parse import parse_qs, urlparse
from xml.sax import ContentHandler
from xml.sax import parse as parse_xml
from yt_dlp import YoutubeDL
from yt_dlp.utils import sanitize_filename
# Template for the subtitle file name; filled in from the video info mapping
# (``title`` and ``id`` keys).
TPL_FILENAME = '{title:s} [{id:s}].en.ttml'

# Base options passed to ``YoutubeDL``: no media download, only English
# automatic subtitles in TTML format, named after the video title and id.
YDL_OPTS = dict(
    skip_download=True,
    subtitlesformat='ttml',
    subtitleslangs=['en'],
    writeautomaticsub=True,
    outtmpl=dict(default='%(title)s [%(id)s].%(ext)s'),
)
# Command-line interface: two optional directories plus the video URL.
parser = ArgumentParser()
parser.add_argument(
    '--cache-dir',
    type=Path,
    default=Path('cache'),
)
parser.add_argument(
    '--output-dir',
    type=Path,
    default=Path('output'),
)
parser.add_argument(
    'url',
    help='url to render to text',
    type=str,
)
def fetch(url: str, cache_dir: Path):
    """Download English automatic TTML subtitles for a YouTube video.

    The video id is read from the ``v`` query parameter of ``url``. The
    extracted video info is dumped to ``<cache_dir>/<id>.json`` and passed
    back to yt-dlp through ``download_with_info_file``. The subtitle file
    is then renamed so that its name starts with the upload date.

    Args:
        url: Video URL carrying a ``v=<id>`` query parameter.
        cache_dir: Directory receiving the info dump and the subtitles.

    Returns:
        A ``(video_id, subtitle_path)`` tuple.

    Raises:
        ValueError: If ``url`` has no ``v`` query parameter.
    """
    url_ids = parse_qs(urlparse(url).query).get('v')
    if not url_ids:
        raise ValueError('Wrong YouTube video URL format.')
    url_id = url_ids[-1]
    info_path = cache_dir / f'{url_id}.json'

    # Copy the nested template mapping too: a shallow copy of YDL_OPTS would
    # share it, and every call would prepend ``cache_dir`` once more to the
    # module-level default.
    outtmpl = dict(YDL_OPTS['outtmpl'])
    outtmpl['default'] = str(cache_dir / outtmpl['default'])
    opts = {
        **YDL_OPTS,
        'outtmpl': outtmpl,
        'paths': {'default': str(cache_dir)},
    }

    with YoutubeDL(opts) as ydl:
        info = ydl.extract_info(url, download=False)
        # The dump keeps non-ASCII characters verbatim, so pin the encoding
        # instead of relying on the platform default.
        with open(info_path, 'w', encoding='utf-8') as fout:
            obj = ydl.sanitize_info(info)
            dump(obj, fout, ensure_ascii=False, indent=None)
        # The info file is closed (flushed) before yt-dlp reads it back.
        ydl.download_with_info_file(info_path)

    # ``upload_date`` is a compact YYYYMMDD string; ``strptime`` parses it on
    # every Python version (``fromisoformat`` only accepts this form since
    # 3.11). A missing or empty date falls back to the epoch.
    upload_date = info.get('upload_date') or '19700101'
    prefix = datetime.strptime(upload_date, '%Y%m%d').strftime('%Y-%m-%d ')

    filename = sanitize_filename(TPL_FILENAME.format(**info))
    src = cache_dir / filename
    dst = cache_dir / f'{prefix}{filename}'
    rename(src, dst)
    return url_id, dst
# TODO(@daskol): No support for <span> element.
class Handler(ContentHandler):
    """SAX handler that collects the text found inside ``<p>`` elements.

    A SAX parser may report one contiguous run of text through several
    ``characters`` calls (for example around character references such as
    ``&#39;``). Chunks are therefore buffered and merged into one segment
    at the next element boundary, so words are not split by the spaces
    that ``content`` inserts between segments.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        super().__init__()
        # Nesting depth of currently open <p> elements.
        self.in_p = 0
        # Completed text runs, in document order.
        self.segments = []
        # Chunks of the text run currently being received.
        self._pending = []

    @property
    def content(self) -> str:
        """Return all collected text runs joined by single spaces."""
        pieces = self.segments
        if self._pending:
            # Include a run that has not been closed by an element boundary.
            pieces = [*pieces, ''.join(self._pending)]
        return ' '.join(pieces)

    def characters(self, content: str):
        """Buffer character data, but only while inside a ``<p>``."""
        if self.in_p > 0:
            self._pending.append(content)

    def endElement(self, name):
        """Close the current text run and track leaving a ``<p>``."""
        self._flush()
        if name == 'p':
            self.in_p -= 1

    def startElement(self, name, attrs):
        """Close the current text run and track entering a ``<p>``."""
        self._flush()
        if name == 'p':
            self.in_p += 1

    def _flush(self):
        """Move the buffered chunks to ``segments`` as one text run."""
        if self._pending:
            self.segments.append(''.join(self._pending))
            self._pending.clear()
def render(path: str) -> str:
    """Extract the plain text of a TTML subtitle file.

    Args:
        path: Location of the TTML (XML) file; anything ``open`` accepts.

    Returns:
        The text collected by ``Handler`` from the document.
    """
    handler = Handler()
    # Binary mode hands raw bytes to the XML parser, which then decodes them
    # according to the document itself rather than the platform's default
    # text encoding.
    with open(path, 'rb') as fin:
        parse_xml(fin, handler)
    return handler.content
def run(url: str, cache_dir: Path, output_dir: Path):
    """Fetch subtitles for ``url`` and write them out as a text file.

    Both directories are created when missing. The output file is named
    after the fetched subtitle file, with its last suffix replaced by
    ``.txt``.

    Args:
        url: Video URL passed on to ``fetch``.
        cache_dir: Directory for files downloaded by ``fetch``.
        output_dir: Directory receiving the rendered text file.
    """
    cache_dir.mkdir(exist_ok=True, parents=True)
    _, path = fetch(url, cache_dir)
    content = render(path)

    output_dir.mkdir(exist_ok=True, parents=True)
    target = output_dir / path.with_suffix('.txt').name
    # Subtitles may contain non-ASCII text; pin the encoding so writing does
    # not depend on the platform default.
    with open(target, 'w', encoding='utf-8') as fout:
        fout.write(content)
def main(args: Namespace):
    """Run the pipeline with the values taken from parsed CLI arguments."""
    run(
        url=args.url,
        cache_dir=args.cache_dir,
        output_dir=args.output_dir,
    )
# Script entry point: parse the command line, then hand over to ``main``.
if __name__ == '__main__':
    cli_args = parser.parse_args()
    main(cli_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment