# waseigo/summarize_youtube_video_transcript.py

## summarize_youtube_video_transcript.py
import sys
import yt_dlp
import json
import openai

"""
Usage: python3 summarize_youtube_video_transcript.py "[YOUTUBE VIDEO URL]"
Example: python3 summarize_youtube_video_transcript.py "https://www.youtube.com/watch?v=1Gyjj8pyi_0"
"""

# Your OpenAI API key, obtained from https://beta.openai.com/account/api-keys
OPENAI_KEY = None

URL = sys.argv[-1]


def download_subs(URL, auto=False):
    """Run yt-dlp on *URL* requesting subtitles only, and return its info.

    The media download itself is skipped and subtitles are requested in the
    ``json3`` format.

    Args:
        URL: Address of the video to query.
        auto: When true, set yt-dlp's ``writeautomaticsub`` option; otherwise
            set ``writesubtitles``.

    Returns:
        Whatever ``YoutubeDL.extract_info`` returns for *URL*.
    """
    # Exactly one of the two subtitle switches is enabled per call.
    subtitle_switch = "writeautomaticsub" if auto else "writesubtitles"
    options = {
        "skip_download": True,
        "subtitlesformat": "json3",
        subtitle_switch: True,
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        return downloader.extract_info(URL)


def subs_exist(info):
    """Return True when *info* lists at least one requested subtitle track.

    Args:
        info: Mapping as returned by ``download_subs``; only its
            ``"requested_subtitles"`` entry is read.

    Returns:
        bool: False when that entry is missing, ``None`` or empty.
    """
    # An empty mapping must count as "no subtitles": the loader picks the
    # first language out of this mapping and would fail on an empty one.
    return bool(info.get("requested_subtitles"))


def subs_to_json(info):
    """Load the first requested subtitle file listed in *info* as JSON.

    Args:
        info: Mapping whose ``"requested_subtitles"`` entry maps a language
            code to a dict holding the subtitle file's ``"filepath"``.

    Returns:
        The parsed JSON document, or ``None`` when *info* lists no subtitle
        track (entry missing, ``None`` or empty).

    Raises:
        OSError: The subtitle file cannot be opened.
        json.JSONDecodeError: The file does not contain valid JSON.
    """
    tracks = info.get("requested_subtitles")
    if not tracks:
        return None

    # Only the first listed language is used.
    first_track = next(iter(tracks.values()))

    # The file is read as bytes; the json module detects the encoding itself.
    with open(first_track["filepath"], "rb") as subs_file:
        return json.load(subs_file)


def extract_subs_auto(sj):
    """Flatten a parsed subtitle document into one transcript string.

    Used by the script for automatically generated captions. An event
    contributes only when it has a non-empty ``"segs"`` list whose first
    segment carries a ``"utf8"`` text. Within such an event every segment
    text is stripped and the pieces are joined with single spaces; events
    that end up empty are dropped.

    Args:
        sj: Parsed subtitle JSON with a top-level ``"events"`` list.

    Returns:
        str: The event texts joined with single spaces ("" if none).
    """
    phrases = []
    for event in sj["events"]:
        segs = event.get("segs")
        # Guard against an empty segment list before peeking at segs[0].
        if not segs or "utf8" not in segs[0]:
            continue
        phrase = " ".join(seg["utf8"].strip() for seg in segs if "utf8" in seg)
        if phrase:
            phrases.append(phrase)
    return " ".join(phrases)


def extract_subs_nonauto(sj):
    """Flatten a parsed subtitle document into one transcript string.

    Used by the script for non-automatic subtitles. For every event that has
    segments, the ``"utf8"`` texts of all its segments are concatenated and
    newlines inside the result are replaced by spaces. Events without
    segments are skipped instead of raising.

    Args:
        sj: Parsed subtitle JSON with a top-level ``"events"`` list.

    Returns:
        str: The event texts joined with single spaces ("" if none).
    """
    lines = []
    for event in sj["events"]:
        segs = event.get("segs")
        if not segs:
            continue
        # Use every segment, not just the first, so no text is dropped when
        # an event is split into several segments.
        text = "".join(seg.get("utf8", "") for seg in segs)
        lines.append(text.replace("\n", " "))
    return " ".join(lines)


def _split_into_segments(transcript, budget):
    """Pack the sentences of *transcript* into chunks of at most *budget* chars."""
    import textwrap

    parts = transcript.split(". ")
    # split() consumed the ". " separators: put the period back on every
    # piece except the last one, which keeps whatever ending it already had.
    sentences = [part + "." for part in parts[:-1]] + [parts[-1]]

    pieces = []
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= budget:
            pieces.append(sentence)
        else:
            # A single over-long sentence (e.g. unpunctuated captions) is
            # broken on whitespace so that no piece exceeds the budget.
            pieces.extend(textwrap.wrap(sentence, width=budget))

    segments = []
    current = []
    current_length = 0
    for piece in pieces:
        # +1 accounts for the space that joins this piece to the previous one.
        added = len(piece) + (1 if current else 0)
        if current and current_length + added > budget:
            segments.append(" ".join(current))
            current = [piece]
            current_length = len(piece)
        else:
            current.append(piece)
            current_length += added
    if current:
        segments.append(" ".join(current))
    return segments


def summarize_gpt(transcript):
    """Summarize *transcript* segment by segment with the OpenAI API.

    The transcript is cut at sentence boundaries into segments that, together
    with the instruction prefix, stay within ``max_prompt_length`` characters.
    One completion request is sent per segment; each answer is printed as it
    arrives and collected.

    Args:
        transcript: Full transcript text.

    Returns:
        list[str]: One completion text per segment, in transcript order.
        Empty when the transcript contains no text (no request is sent).
    """
    # Different prompt options:
    # PROMPT = "Derive the key takeaways from this text: "
    # PROMPT = "what is this text about?"
    PROMPT = "Summarize the following segment of a video transcript: "

    # NOTE(review): confirm this model/endpoint is still served by OpenAI.
    MODEL = "text-davinci-003"
    # Presumably the model's context size in tokens. Prompt sizes below are
    # measured in characters and subtracted from it directly.
    MAX_LENGTH = 4097

    # Split the transcript into segments based on the expectation that
    # the GPT-3 output will be 1/MAX_SUMMARIZATION_FACTOR of the input
    # (also a simple way to reduce the number of API calls).
    MAX_SUMMARIZATION_FACTOR = 3
    max_prompt_length = MAX_LENGTH - MAX_LENGTH // MAX_SUMMARIZATION_FACTOR

    segments = _split_into_segments(transcript, max_prompt_length - len(PROMPT))

    # Saved in case this is run interactively and the answers get reused.
    responses = []
    if not segments:
        return responses

    openai.api_key = OPENAI_KEY
    for segment in segments:
        prompt = PROMPT + segment
        # Every prompt fits in max_prompt_length, so this stays positive.
        max_tokens = MAX_LENGTH - len(prompt)
        response = openai.Completion.create(
            engine=MODEL,
            prompt=prompt,
            max_tokens=max_tokens,
            n=1,
            stop=None,
            temperature=0.5,
        )
        answer = response["choices"][0]["text"]
        responses.append(answer)
        print(answer)

    return responses


if __name__ == "__main__":
    # Prefer regular subtitles; fall back to automatic captions when the
    # first request yields none.
    info = download_subs(URL, auto=False)
    if subs_exist(info):
        subs_json = subs_to_json(info)
        transcript = extract_subs_nonauto(subs_json)
    else:
        info = download_subs(URL, auto=True)
        if not subs_exist(info):
            # Report the failure and exit with a non-zero status.
            sys.exit("No subtitles or automatic captions found for this video.")
        subs_json = subs_to_json(info)
        transcript = extract_subs_auto(subs_json)

    print(transcript)

    # Summarize only when an API key is configured and the transcript
    # contains at least one period.
    if OPENAI_KEY and "." in transcript:
        responses = summarize_gpt(transcript)