Skip to content

Instantly share code, notes, and snippets.

@dcorney
Created August 21, 2024 08:32
Show Gist options
  • Save dcorney/fa2536523631d7132d1a48aa41ff32b2 to your computer and use it in GitHub Desktop.
Save dcorney/fa2536523631d7132d1a48aa41ff32b2 to your computer and use it in GitHub Desktop.
Getting video transcripts
import re
import json
import string
import requests
from langdetect import detect
def mostly_english(sentences):
    """Quick check to see if most sentences are probably English.

    Args:
        sentences: Sized iterable of dicts; the text of each sentence is read
            from its "sentence_text" key.

    Returns:
        True if strictly more than half of the sentences are detected as
        "en", otherwise False. An empty input yields False.
    """
    en_count = 0
    for sentence in sentences:
        try:
            # A bool adds as 0/1, so this counts the English detections.
            en_count += detect(sentence["sentence_text"]) == "en"
        except Exception:
            # Best effort: language detection can fail on empty/trivial
            # strings, in which case the sentence just counts as non-English.
            # Catching Exception rather than using a bare `except` lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
    return en_count > (0.5 * len(sentences))
def get_captions(video_id: str) -> dict:
    """Get captions from a given YouTube video, preferring an English track.

    The watch page is fetched and scanned for timedtext caption URLs. Each
    URL is downloaded and its ``<text>`` elements are parsed into sentences.
    The first transcript found is kept, and it is replaced by any later
    transcript that ``mostly_english`` classifies as English.

    Args:
        video_id: YouTube video identifier, used as the ``v=`` query parameter.

    Returns:
        A dict with the keys "video_id" and "sentences", or an empty dict if
        no captions URL was found or fetching/parsing failed before any
        transcript was stored. Each sentence is whatever ``clean_str`` returns
        for the regex groups ``start`` and ``sentence_text``.

    Note:
        Exceptions raised by the initial watch-page request are not caught
        and propagate to the caller. Failures after that point are printed
        and whatever has been collected so far is returned.
    """
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    page_response = requests.get(video_url, allow_redirects=False, timeout=60)

    # Compile both patterns once, outside the loop. The dots in the host name
    # are escaped so they only match literal dots.
    captions_url_pat = re.compile(
        r"(https://www\.youtube\.com/api/timedtext\?.*?)\""
    )
    # Extracts the timestamp and text of each sentence from the captions XML.
    sentence_pat = re.compile(
        r'<text start="(?P<start>[0-9\.]*?)" dur="[0-9\.]*?">(?P<sentence_text>[^<]*)<\/text>'
    )

    merged_transcripts = {}
    try:
        for captions_url in captions_url_pat.findall(page_response.text):
            # The URL is embedded in the page with "&" escaped as \u0026.
            captions_url = captions_url.replace("\\u0026", "&")
            captions_response = requests.get(
                captions_url, allow_redirects=False, timeout=60
            )
            # NOTE(review): clean_str is not defined in the visible part of
            # this file - confirm it is provided elsewhere.
            transcript = {
                "video_id": video_id,
                "sentences": [
                    clean_str(m.groupdict())
                    for m in sentence_pat.finditer(captions_response.text)
                ],
            }
            if mostly_english(transcript["sentences"]) or not merged_transcripts:
                # keep English if we find them, else just the first set we find
                merged_transcripts.update(transcript)
    except Exception as e:
        print(f"Failed to get captions from {video_id}")
        print(e)
    return merged_transcripts
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment