Summarize the transcript of a YouTube video using OpenAI's GPT-3
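Requires the yt-dlp and openai Python packages (e.g. pip install yt-dlp openai). The script uses the legacy Completion API and the text-davinci-003 model, so it assumes a pre-1.0 version of the openai library; adjust accordingly on newer versions.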
import sys
import yt_dlp
import json
import openai

"""
Usage: python3 summarize_youtube_video_transcript.py "[YOUTUBE VIDEO URL]"
Example: python3 summarize_youtube_video_transcript.py "https://www.youtube.com/watch?v=1Gyjj8pyi_0"
"""

# Your OpenAI API key from here: https://beta.openai.com/account/api-keys
OPENAI_KEY = None

# The video URL is taken from the last command-line argument
URL = sys.argv[-1]
def download_subs(URL, auto=False):
    # Fetch only the subtitles (no video) in json3 format; auto=True requests
    # YouTube's auto-generated captions instead of uploaded ones.
    ydl_opts = {"skip_download": True, "subtitlesformat": "json3"}
    if auto:
        ydl_opts["writeautomaticsub"] = True
    else:
        ydl_opts["writesubtitles"] = True
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(URL)
    return info
def subs_exist(info):
    return info["requested_subtitles"] is not None
def subs_to_json(info):
    if subs_exist(info):
        lang = list(info["requested_subtitles"].keys())[0]
        subs_fn = info["requested_subtitles"][lang]["filepath"]
        with open(subs_fn, "rb") as s:
            j = s.read()
        return json.loads(j)
def extract_subs_auto(sj):
    # Auto-generated captions: each event holds word-level segments ("segs");
    # join their "utf8" fields into one plain-text transcript.
    events = sj["events"]
    segs = [p["segs"] for p in events if "segs" in p]
    c1 = [p for p in segs if "utf8" in p[0].keys()]
    c2 = [[q["utf8"].strip() for q in p if "utf8" in q.keys()] for p in c1]
    c3 = [" ".join(p) for p in c2]
    transcript = " ".join([p for p in c3 if p != ""])
    return transcript
def extract_subs_nonauto(sj):
    # Uploaded captions: each event carries one caption line in its first segment.
    events = sj["events"]
    lines = [e["segs"][0]["utf8"].replace("\n", " ") for e in events if "segs" in e]
    return " ".join(lines)
def summarize_gpt(transcript):
    # Different prompt options:
    # PROMPT = "Derive the key takeaways from this text: "
    # PROMPT = "What is this text about?"
    PROMPT = "Summarize the following segment of a video transcript: "
    MODEL = "text-davinci-003"
    # text-davinci-003 allows 4,097 tokens for prompt plus completion; the script
    # approximates token counts with character counts, which is rough but usually
    # conservative for English text.
    MAX_LENGTH = 4097
    # Split the transcript into segments based on the expectation that
    # the GPT-3 output will be 1/MAX_SUMMARIZATION_FACTOR of the input
    # (also a simple way to reduce the number of API calls)
    MAX_SUMMARIZATION_FACTOR = 3
    sentences = [s + "." for s in transcript.split(". ")]
    sentence_lengths = [len(s) for s in sentences]
    max_prompt_length = MAX_LENGTH - int(MAX_LENGTH / MAX_SUMMARIZATION_FACTOR)
    sentence_lengths_cum = [
        sum(sentence_lengths[:k]) for k in range(len(sentence_lengths))
    ]
    # A new segment starts wherever the cumulative length wraps around the
    # per-prompt character budget, i.e. wherever cmod decreases.
    cmod = [p % (max_prompt_length - len(PROMPT)) for p in sentence_lengths_cum]
    cuts = [True] + [cmod[k - 1] > cmod[k] for k in range(1, len(cmod))]
    starts = [k for k in range(len(cuts)) if cuts[k]] + [len(cmod)]
    segments = [sentences[starts[k] : starts[k + 1]] for k in range(len(starts) - 1)]
    prompts = [PROMPT + " ".join(segment) for segment in segments]
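    # Toy illustration (hypothetical numbers): with a per-prompt budget of 100
    # characters and cumulative sentence lengths [0, 40, 90, 130, 180], cmod is
    # [0, 40, 90, 30, 80]; it drops between indices 2 and 3, so sentences 0-2
    # form the first segment and sentences 3-4 the second.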
    # Save responses in case we run this interactively and want to reuse them
    responses = []
    openai.api_key = OPENAI_KEY
    for prompt in prompts:
        # Leave whatever room is not taken up by the prompt for the completion
        max_tokens = MAX_LENGTH - len(prompt)
        response = openai.Completion.create(
            engine=MODEL,
            prompt=prompt,
            max_tokens=max_tokens,
            n=1,
            stop=None,
            temperature=0.5,
        )
        r = response["choices"][0]["text"]
        responses.append(r)
        print(r)
    return responses
# Prefer uploaded subtitles; fall back to auto-generated captions, and give up if neither exists.
info = download_subs(URL, auto=False)
if subs_exist(info):
    subs_json = subs_to_json(info)
    transcript = extract_subs_nonauto(subs_json)
else:
    info = download_subs(URL, auto=True)
    if subs_exist(info):
        subs_json = subs_to_json(info)
        transcript = extract_subs_auto(subs_json)
    else:
        sys.exit()

print(transcript)

# Only call the API if a key is set and the transcript has more than one sentence
if OPENAI_KEY not in [None, ""] and len(transcript.split(".")) > 1:
    responses = summarize_gpt(transcript)