# dcorney/yt_transcripts.py

## yt_transcripts.py
import re
import json
import string
import requests
from langdetect import detect


def mostly_english(sentences):
    """Quick check to see if most sentences are probably English.

    Args:
        sentences: Sized collection of mappings, each holding its text
            under the ``"sentence_text"`` key.

    Returns:
        bool: ``True`` when strictly more than half of the sentences are
        classified as ``"en"`` by ``langdetect.detect``; ``False``
        otherwise (including for an empty collection).
    """
    en_count = 0
    for s in sentences:
        try:
            is_english = detect(s["sentence_text"]) == "en"
        except Exception:
            # Best-effort: language detection can fail on empty/trivial
            # strings, so such a sentence simply does not count as English.
            # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit
            # still propagate.
            continue
        en_count += is_english
    return en_count > (0.5 * len(sentences))


def get_captions(video_id: str) -> dict:
    """Get captions from a given YouTube video. Selects English version if more than one is available.

    The watch page for ``video_id`` is downloaded, every timed-text
    (captions) URL found in its source is fetched in turn, and each
    response is parsed into a list of sentences.

    Args:
        video_id: YouTube video identifier, used as the ``v`` query
            parameter of the watch URL.

    Returns:
        dict: ``{"video_id": ..., "sentences": [...]}`` for the selected
        caption track, where each sentence is the result of ``clean_str``
        applied to a dict with ``"start"`` and ``"sentence_text"`` keys.
        An empty dict is returned when no caption track could be read.

    Raises:
        Exceptions raised by the initial watch-page request are not caught
        here and propagate to the caller. Failures while fetching or
        parsing the caption tracks are printed and whatever was collected
        up to that point is returned.
    """
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    page = requests.get(video_url, allow_redirects=False, timeout=60)
    merged_transcripts = {}

    # Both patterns are loop-invariant, so compile them once up front.
    # Simple regex to pull the captions URLs out of the page source; the
    # dots in the host name are escaped so they only match a literal ".".
    captions_url_pat = re.compile(
        r"(https://www\.youtube\.com/api/timedtext\?.*?)\""
    )
    # Extracts the start timestamp and the text of each caption sentence.
    sentence_pat = re.compile(
        r'<text start="(?P<start>[0-9\.]*?)" dur="[0-9\.]*?">(?P<sentence_text>[^<]*)<\/text>'
    )

    try:
        for captions_url in captions_url_pat.findall(page.text):
            # Ampersands appear as the escape sequence \u0026 in the page.
            captions_url = captions_url.replace("\\u0026", "&")

            captions = requests.get(captions_url, allow_redirects=False, timeout=60)

            transcript = {
                "video_id": video_id,
                "sentences": [
                    clean_str(m.groupdict())
                    for m in sentence_pat.finditer(captions.text)
                ],
            }
            if mostly_english(transcript["sentences"]) or not merged_transcripts:
                # keep English if we find them, else just the first set we find
                merged_transcripts.update(transcript)
    except Exception as e:
        print(f"Failed to get captions from {video_id}")
        print(e)
    return merged_transcripts
	import re
	import json
	import string
	import requests
	from langdetect import detect


	def mostly_english(sentences):
	    """Quick check to see if most sentences are probably English.

	    Args:
	        sentences: Sized collection of mappings, each holding its text
	            under the ``"sentence_text"`` key.

	    Returns:
	        bool: ``True`` when strictly more than half of the sentences are
	        classified as ``"en"`` by ``langdetect.detect``; ``False``
	        otherwise (including for an empty collection).
	    """
	    en_count = 0
	    for s in sentences:
	        try:
	            is_english = detect(s["sentence_text"]) == "en"
	        except Exception:
	            # Best-effort: language detection can fail on empty/trivial
	            # strings, so such a sentence simply does not count as English.
	            # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit
	            # still propagate.
	            continue
	        en_count += is_english
	    return en_count > (0.5 * len(sentences))


	def get_captions(video_id: str) -> dict:
	    """Get captions from a given YouTube video. Selects English version if more than one is available.

	    The watch page for ``video_id`` is downloaded, every timed-text
	    (captions) URL found in its source is fetched in turn, and each
	    response is parsed into a list of sentences.

	    Args:
	        video_id: YouTube video identifier, used as the ``v`` query
	            parameter of the watch URL.

	    Returns:
	        dict: ``{"video_id": ..., "sentences": [...]}`` for the selected
	        caption track, where each sentence is the result of ``clean_str``
	        applied to a dict with ``"start"`` and ``"sentence_text"`` keys.
	        An empty dict is returned when no caption track could be read.

	    Raises:
	        Exceptions raised by the initial watch-page request are not caught
	        here and propagate to the caller. Failures while fetching or
	        parsing the caption tracks are printed and whatever was collected
	        up to that point is returned.
	    """
	    video_url = f"https://www.youtube.com/watch?v={video_id}"
	    page = requests.get(video_url, allow_redirects=False, timeout=60)
	    merged_transcripts = {}

	    # Both patterns are loop-invariant, so compile them once up front.
	    # Simple regex to pull the captions URLs out of the page source; the
	    # dots in the host name are escaped so they only match a literal ".".
	    captions_url_pat = re.compile(
	        r"(https://www\.youtube\.com/api/timedtext\?.*?)\""
	    )
	    # Extracts the start timestamp and the text of each caption sentence.
	    # The "*?" quantifiers are required: without them the start/dur values
	    # could be at most one character long and "12.5" would never match.
	    sentence_pat = re.compile(
	        r'<text start="(?P<start>[0-9\.]*?)" dur="[0-9\.]*?">(?P<sentence_text>[^<]*)<\/text>'
	    )

	    try:
	        for captions_url in captions_url_pat.findall(page.text):
	            # Ampersands appear as the escape sequence \u0026 in the page.
	            captions_url = captions_url.replace("\\u0026", "&")

	            captions = requests.get(captions_url, allow_redirects=False, timeout=60)

	            transcript = {
	                "video_id": video_id,
	                "sentences": [
	                    clean_str(m.groupdict())
	                    for m in sentence_pat.finditer(captions.text)
	                ],
	            }
	            if mostly_english(transcript["sentences"]) or not merged_transcripts:
	                # keep English if we find them, else just the first set we find
	                merged_transcripts.update(transcript)
	    except Exception as e:
	        print(f"Failed to get captions from {video_id}")
	        print(e)
	    return merged_transcripts