Skip to content

Instantly share code, notes, and snippets.

@waseigo
Last active January 28, 2023 21:10
Show Gist options
  • Save waseigo/a5224237cd40b625908570aaa58a5ca1 to your computer and use it in GitHub Desktop.
Save waseigo/a5224237cd40b625908570aaa58a5ca1 to your computer and use it in GitHub Desktop.
Summarize the transcript of a YouTube video using OpenAI's GPT-3
import sys
import yt_dlp
import json
import openai
"""
Usage: python3 summarize_youtube_video_transcript.py "[YOUTUBE VIDEO URL]"
Example: python3 summarize_youtube_video_transcript.py "https://www.youtube.com/watch?v=1Gyjj8pyi_0"
"""
# Your OpenAI API key, obtained from https://beta.openai.com/account/api-keys
# While this is None or "", the script only prints the transcript and skips
# the summarization step.
OPENAI_KEY = None
# The video URL is read from the last command-line argument.
URL = sys.argv[-1]
def download_subs(URL, auto=False):
    """Ask yt-dlp for the subtitles of a video in json3 format.

    The media download itself is disabled through the ``skip_download``
    option; only subtitle writing is enabled.

    Args:
        URL: Address of the video to inspect.
        auto: When true, request automatically generated captions
            (``writeautomaticsub``); otherwise request uploaded subtitles
            (``writesubtitles``).

    Returns:
        The info object returned by ``YoutubeDL.extract_info``.
    """
    subtitle_switch = "writeautomaticsub" if auto else "writesubtitles"
    options = {
        "skip_download": True,
        "subtitlesformat": "json3",
        subtitle_switch: True,
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        info = downloader.extract_info(URL)
    return info
def subs_exist(info):
    """Tell whether the info dict lists at least one requested subtitle track.

    Args:
        info: Mapping as returned by ``download_subs``.

    Returns:
        True when ``info["requested_subtitles"]`` is present and non-empty,
        False when it is missing, None or an empty mapping. An empty mapping
        is treated as "no subtitles" because there is no track to read.
    """
    return bool(info.get("requested_subtitles"))
def subs_to_json(info):
    """Load the first requested subtitle file and parse it as JSON.

    Args:
        info: Mapping as returned by ``download_subs``; its
            ``"requested_subtitles"`` entry maps a language code to a dict
            holding the ``"filepath"`` of the written subtitle file.

    Returns:
        The parsed JSON content of the first listed subtitle file, or None
        when no subtitle track is listed (missing, None or empty mapping).
    """
    requested = info.get("requested_subtitles")
    if not requested:
        return None

    # Only the first listed language is used, matching insertion order.
    first_track = next(iter(requested.values()))
    # Binary mode lets the json module detect the encoding itself.
    with open(first_track["filepath"], "rb") as subs_file:
        return json.load(subs_file)
def extract_subs_auto(sj):
    """Build a plain-text transcript from parsed automatic-caption JSON.

    Args:
        sj: Parsed subtitle JSON with an ``"events"`` list. Each event may
            carry a ``"segs"`` list of dicts whose ``"utf8"`` entry holds text.

    Returns:
        All caption text joined by single spaces. Within an event the
        whitespace-stripped segment texts are joined by spaces; events that
        end up empty are left out.
    """
    lines = []
    for event in sj["events"]:
        segs = event.get("segs")
        # Skip events with no segments (an empty list used to raise
        # IndexError) and events whose first segment carries no text.
        if not segs or "utf8" not in segs[0]:
            continue
        words = [seg["utf8"].strip() for seg in segs if "utf8" in seg]
        line = " ".join(words)
        if line:
            lines.append(line)
    return " ".join(lines)
def extract_subs_nonauto(sj):
    """Build a plain-text transcript from parsed uploaded-subtitle JSON.

    Args:
        sj: Parsed subtitle JSON with an ``"events"`` list. Each event may
            carry a ``"segs"`` list of dicts whose ``"utf8"`` entry holds text.

    Returns:
        One line per event, joined by single spaces, with newlines inside a
        line replaced by spaces. Events without any text segment are skipped
        instead of raising, and all text segments of an event are used rather
        than only the first one.
    """
    lines = []
    for event in sj["events"]:
        pieces = [seg["utf8"] for seg in event.get("segs") or [] if "utf8" in seg]
        if not pieces:
            continue
        # Segments are consecutive parts of the same line, so concatenate them.
        lines.append("".join(pieces).replace("\n", " "))
    return " ".join(lines)
def summarize_gpt(transcript):
    """Summarize a transcript piece by piece with the OpenAI completion API.

    The transcript is split into sentences on ". ", the sentences are grouped
    into segments, and one completion request is sent per segment. Each
    summary is printed as soon as it is received.

    Args:
        transcript: The whole transcript as one string.

    Returns:
        A list with one summary string per segment, in transcript order.
        The list is empty when the transcript contains no text.
    """
    # Different prompt options:
    # PROMPT = "Derive the key takeaways from this text: "
    # PROMPT = "what is this text about?"
    PROMPT = "Summarize the following segment of a video transcript: "
    MODEL = "text-davinci-003"
    # NOTE(review): this limit is compared against character counts below,
    # while the API's max_tokens is a token count — presumably a deliberately
    # conservative approximation; confirm.
    MAX_LENGTH = 4097
    # Split the transcript into segments based on the expectation that
    # the GPT-3 output will be 1/MAX_SUMMARIZATION_FACTOR
    # (also a simple way to reduce the API calls)
    MAX_SUMMARIZATION_FACTOR = 3

    # split() removes the ". " separators, so restore the period on every
    # piece except the last one, which only gets a period if it lacks one
    # (avoids turning a final "sentence." into "sentence..").
    pieces = transcript.split(". ")
    sentences = [piece + "." for piece in pieces[:-1]]
    tail = pieces[-1]
    if tail:
        sentences.append(tail if tail.endswith(".") else tail + ".")
    if not sentences:
        # Nothing to summarize; avoid sending a request with a bare prompt.
        return []

    max_prompt_length = MAX_LENGTH - int(MAX_LENGTH / MAX_SUMMARIZATION_FACTOR)
    window = max_prompt_length - len(PROMPT)

    # offsets[k] is the combined length of all sentences before sentence k,
    # accumulated in a single pass.
    offsets = []
    consumed = 0
    for sentence in sentences:
        offsets.append(consumed)
        consumed += len(sentence)

    # A new segment starts wherever the offset, taken modulo the window size,
    # wraps around relative to the previous sentence.
    positions = [offset % window for offset in offsets]
    starts = [0]
    starts.extend(
        k
        for k, (previous, current) in enumerate(zip(positions, positions[1:]), start=1)
        if previous > current
    )
    starts.append(len(sentences))

    segments = [sentences[begin:end] for begin, end in zip(starts, starts[1:])]
    prompts = [PROMPT + " ".join(segment) for segment in segments]

    # save responses in case we run this interactively and want to reuse them
    responses = []
    openai.api_key = OPENAI_KEY
    for prompt in prompts:
        # Whatever the prompt does not use of the budget is left for the answer.
        max_tokens = MAX_LENGTH - len(prompt)
        response = openai.Completion.create(
            engine=MODEL,
            prompt=prompt,
            max_tokens=max_tokens,
            n=1,
            stop=None,
            temperature=0.5,
        )
        r = response["choices"][0]["text"]
        responses.append(r)
        print(r)
    return responses
# Prefer uploaded subtitles; fall back to automatically generated captions.
info = download_subs(URL, auto=False)
if subs_exist(info):
    subs_json = subs_to_json(info)
    transcript = extract_subs_nonauto(subs_json)
else:
    info = download_subs(URL, auto=True)
    if subs_exist(info):
        subs_json = subs_to_json(info)
        transcript = extract_subs_auto(subs_json)
    else:
        # sys.exit is always available (the bare exit() helper is only
        # injected by the site module) and the message explains why the
        # script stops.
        sys.exit("No subtitles or automatic captions found for " + URL)

print(transcript)

# Summarize only when an API key is configured and the transcript contains
# at least one period to split sentences on.
if OPENAI_KEY and len(transcript.split(".")) > 1:
    responses = summarize_gpt(transcript)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment