Created
August 21, 2024 08:32
-
-
Save dcorney/fa2536523631d7132d1a48aa41ff32b2 to your computer and use it in GitHub Desktop.
Getting video transcripts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import json | |
import string | |
import requests | |
from langdetect import detect | |
def mostly_english(sentences) -> bool:
    """Return True if more than half of the sentences are detected as English.

    Args:
        sentences: Sized collection of dicts, each expected to carry the
            sentence under the ``"sentence_text"`` key.

    Returns:
        True when strictly more than 50% of the entries are detected as
        ``"en"``; False otherwise (including for an empty collection).
    """
    en_count = 0
    for s in sentences:
        try:
            if detect(s["sentence_text"]) == "en":
                en_count += 1
        except Exception:
            # Best-effort check: language detection can fail on empty/trivial
            # strings, so such sentences simply do not count as English.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            continue
    return en_count > (0.5 * len(sentences))
# Captures every timed-text (captions) URL embedded in the watch page, up to
# the closing double quote. Compiled once rather than on each call.
_CAPTIONS_URL_RE = re.compile(r"(https://www\.youtube\.com/api/timedtext\?.*?)\"")

# Captures the start timestamp and the text of each <text> element in a
# captions document. Compiled once rather than once per captions track.
_CAPTION_TEXT_RE = re.compile(
    r'<text start="(?P<start>[0-9\.]*?)" dur="[0-9\.]*?">(?P<sentence_text>[^<]*)<\/text>'
)


def get_captions(video_id: str) -> dict:
    """Get captions from a given YouTube video, preferring an English track.

    The watch page is fetched and scanned for captions URLs; each one is
    downloaded and parsed into sentences. The first track found is kept as a
    fallback, and any track judged mostly English replaces whatever was kept
    before it.

    Args:
        video_id: The YouTube video identifier used in the ``watch?v=`` URL.

    Returns:
        A dict with keys ``"video_id"`` and ``"sentences"``, or an empty dict
        when no captions URL was found or an error occurred while processing.
        Each sentence is the result of ``clean_str`` applied to a dict with
        ``"start"`` and ``"sentence_text"`` keys (``clean_str`` is defined
        outside this block - presumably it returns a cleaned dict; verify).

    Note:
        Errors raised while fetching the watch page itself propagate to the
        caller; errors raised afterwards are printed and swallowed.
    """
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    page = requests.get(video_url, allow_redirects=False, timeout=60)
    merged_transcripts = {}
    try:
        for captions_url in _CAPTIONS_URL_RE.findall(page.text):
            # The URL is embedded with "&" written as the escape \u0026.
            captions_url = captions_url.replace("\\u0026", "&")
            captions = requests.get(captions_url, allow_redirects=False, timeout=60)
            # Extract timestamp and text for each sentence into a dict.
            transcript = {
                "video_id": video_id,
                "sentences": [
                    clean_str(m.groupdict())
                    for m in _CAPTION_TEXT_RE.finditer(captions.text)
                ],
            }
            # Keep English if we find it, else just the first set we find.
            # The emptiness check comes first so language detection is
            # skipped for the first track, which is kept regardless.
            if not merged_transcripts or mostly_english(transcript["sentences"]):
                merged_transcripts.update(transcript)
    except Exception as e:
        print(f"Failed to get captions from {video_id}")
        print(e)
    return merged_transcripts
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment