Created
May 29, 2022 18:23
-
-
Save manurueda/8b7b4aa5b06b8fbfd844bd35df801bab to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import os | |
import time | |
import xml.etree.ElementTree as ElementTree | |
from html import unescape | |
from typing import Dict, Optional | |
from pytube import request | |
from pytube.helpers import safe_filename, target_directory | |
class Caption: | |
"""Container for caption tracks.""" | |
def __init__(self, caption_track: Dict): | |
"""Construct a :class:`Caption <Caption>`. | |
:param dict caption_track: | |
Caption track data extracted from ``watch_html``. | |
""" | |
self.url = caption_track.get("baseUrl") | |
# Certain videos have runs instead of simpleText | |
# this handles that edge case | |
name_dict = caption_track['name'] | |
if 'simpleText' in name_dict: | |
self.name = name_dict['simpleText'] | |
else: | |
for el in name_dict['runs']: | |
if 'text' in el: | |
self.name = el['text'] | |
# Use "vssId" instead of "languageCode", fix issue #779 | |
self.code = caption_track["vssId"] | |
# Remove preceding '.' for backwards compatibility, e.g.: | |
# English -> vssId: .en, languageCode: en | |
# English (auto-generated) -> vssId: a.en, languageCode: en | |
self.code = self.code.strip('.') | |
@property | |
def xml_captions(self) -> str: | |
"""Download the xml caption tracks.""" | |
return request.get(self.url) | |
def generate_srt_captions(self) -> str: | |
"""Generate "SubRip Subtitle" captions. | |
Takes the xml captions from :meth:`~pytube.Caption.xml_captions` and | |
recompiles them into the "SubRip Subtitle" format. | |
""" | |
return self.xml_caption_to_srt(self.xml_captions) | |
@staticmethod | |
def float_to_srt_time_format(d: float) -> str: | |
"""Convert decimal durations into proper srt format. | |
:rtype: str | |
:returns: | |
SubRip Subtitle (str) formatted time duration. | |
float_to_srt_time_format(3.89) -> '00:00:03,890' | |
""" | |
fraction, whole = math.modf(d) | |
time_fmt = time.strftime("%H:%M:%S,", time.gmtime(whole)) | |
ms = f"{fraction:.3f}".replace("0.", "") | |
return time_fmt + ms | |
def xml_caption_to_srt(self, xml_captions: str) -> str: | |
"""Convert xml caption tracks to "SubRip Subtitle (srt)". | |
:param str xml_captions: | |
XML formatted caption tracks. | |
""" | |
segments = [] | |
root = ElementTree.fromstring(xml_captions) | |
for i, child in enumerate(list(root)): | |
text = child.text or "" | |
caption = unescape(text.replace("\n", " ").replace(" ", " "),) | |
try: | |
duration = float(child.attrib["dur"]) | |
except KeyError: | |
duration = 0.0 | |
start = float(child.attrib["start"]) | |
end = start + duration | |
sequence_number = i + 1 # convert from 0-indexed to 1. | |
line = "{seq}\n{start} --> {end}\n{text}\n".format( | |
seq=sequence_number, | |
start=self.float_to_srt_time_format(start), | |
end=self.float_to_srt_time_format(end), | |
text=caption, | |
) | |
segments.append(line) | |
return "\n".join(segments).strip() | |
def download( | |
self, | |
title: str, | |
srt: bool = True, | |
output_path: Optional[str] = None, | |
filename_prefix: Optional[str] = None, | |
) -> str: | |
"""Write the media stream to disk. | |
:param title: | |
Output filename (stem only) for writing media file. | |
If one is not specified, the default filename is used. | |
:type title: str | |
:param srt: | |
Set to True to download srt, false to download xml. Defaults to True. | |
:type srt bool | |
:param output_path: | |
(optional) Output path for writing media file. If one is not | |
specified, defaults to the current working directory. | |
:type output_path: str or None | |
:param filename_prefix: | |
(optional) A string that will be prepended to the filename. | |
For example a number in a playlist or the name of a series. | |
If one is not specified, nothing will be prepended | |
This is separate from filename so you can use the default | |
filename but still add a prefix. | |
:type filename_prefix: str or None | |
:rtype: str | |
""" | |
if title.endswith(".srt") or title.endswith(".xml"): | |
filename = ".".join(title.split(".")[:-1]) | |
else: | |
filename = title | |
if filename_prefix: | |
filename = f"{safe_filename(filename_prefix)}{filename}" | |
filename = safe_filename(filename) | |
filename += f" ({self.code})" | |
if srt: | |
filename += ".srt" | |
else: | |
filename += ".xml" | |
file_path = os.path.join(target_directory(output_path), filename) | |
with open(file_path, "w", encoding="utf-8") as file_handle: | |
if srt: | |
file_handle.write(self.generate_srt_captions()) | |
else: | |
file_handle.write(self.xml_captions) | |
return file_path | |
def __repr__(self): | |
"""Printable object representation.""" | |
return '<Caption lang="{s.name}" code="{s.code}">'.format(s=self) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment