Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Sh1n0g1/d69db6dbc5c13ce887c23c6828658570 to your computer and use it in GitHub Desktop.
Save Sh1n0g1/d69db6dbc5c13ce887c23c6828658570 to your computer and use it in GitHub Desktop.
youtube-summarizer-with-langchain-chatgpt.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/Sh1n0g1/d69db6dbc5c13ce887c23c6828658570/youtube-summarizer-with-langchain-chatgpt.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# How to use\n",
"* Input OpenAI API Key\n",
"* Input YouTube Video ID\n",
" * You can get the Video ID from the URL\n",
" * If the URL is https://www.youtube.com/watch?v=PlQ4Y8knqvA, then the ID is `PlQ4Y8knqvA`\n",
"* Configure the caption language (Default: en, ja)\n"
],
"metadata": {
"id": "N19SrdFqDsGv"
}
},
{
"cell_type": "code",
"source": [
"#@title 1. Settings\n",
"openai_api_key = \"\" #@param {type:\"string\"}\n",
"youtube_video_id = \"\" #@param {type:\"string\"}\n",
"#@markdown Specify the Language separated by comma (e.g. `en, ja`)\n",
"acceptable_caption_languages = \"en , ja \" #@param {type:\"string\"}\n",
"#@markdown If you want to get extra info about summarization, enable this\n",
"verbose = True #@param {type:\"boolean\"}\n",
"summary_language = \"Japanese\" #@param {type:\"string\"}\n",
"# Fall back to English when the summary language is left blank.\n",
"summary_language = summary_language.strip() or \"English\"\n",
"\n",
"# Split the comma-separated caption languages and drop blank entries.\n",
"# (\"\".split(\",\") returns [\"\"], so the list must be filtered before the emptiness check.)\n",
"languages = [code.strip() for code in acceptable_caption_languages.split(\",\") if code.strip()]\n",
"# Default to English captions when no language was given.\n",
"if not languages:\n",
"  languages = [\"en\"]"
],
"metadata": {
"cellView": "form",
"id": "rlRFCK440YpK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RRYSu48huSUW",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title 2. Install Module \n",
"# Use the %pip magic so the packages are installed into the running kernel's environment.\n",
"# NOTE(review): versions are unpinned; the imports in the next cell may not match newer releases - consider pinning.\n",
"%pip install -q langchain openai tiktoken youtube-transcript-api\n"
]
},
{
"cell_type": "code",
"source": [
"#@title 3. Import Module\n",
"import os\n",
"import textwrap\n",
"\n",
"from langchain import OpenAI, PromptTemplate, LLMChain\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.chains.mapreduce import MapReduceChain\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain.docstore.document import Document\n",
"from langchain.chains.summarize import load_summarize_chain\n",
"from langchain.callbacks import get_openai_callback\n",
"import tiktoken\n",
"\n",
"from youtube_transcript_api import YouTubeTranscriptApi\n"
],
"metadata": {
"cellView": "form",
"id": "JBHj00kg0Gqh"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title 4. Get Youtube Caption\n",
"# Download the transcript for the configured video ID and caption languages.\n",
"youtube_caption = YouTubeTranscriptApi.get_transcript(youtube_video_id, languages)\n",
"\n",
"# Merge the caption entries into one text. An entry that starts more than 3 seconds\n",
"# after the previous entry's start is followed by a comma and a newline; any other\n",
"# entry is followed by a single space.\n",
"caption_parts = []\n",
"previous_caption_start = 0\n",
"for entry in youtube_caption:\n",
"  gap = entry['start'] - previous_caption_start\n",
"  separator = ',\\n' if gap > 3 else ' '\n",
"  caption_parts.append(entry['text'] + separator)\n",
"  previous_caption_start = entry['start']\n",
"caption = \"\".join(caption_parts)\n",
"\n",
"# Show the merged transcript, then its character count and cl100k_base token count.\n",
"print(f\"{caption}\\n{'='*20}\")\n",
"print(f\"Length of Text:{len(caption)}\")\n",
"encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
"print(f\"Tokens: {len(encoding.encode(caption))}\")\n",
"\n",
"# Split the transcript on newlines (chunk_size=1000) and wrap every chunk in a Document.\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, separator=\"\\n\")\n",
"texts = text_splitter.split_text(caption)\n",
"print(f\"{len(texts)} chunks created.\")\n",
"docs = [Document(page_content=chunk) for chunk in texts]"
],
"metadata": {
"cellView": "form",
"id": "RzL3WQro0TOg"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "M5uYr_-TPKgR"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title 5. Use a Custom Prompt (Optional)\n",
"#@markdown Enable this to summarize with the prompts defined in this cell\n",
"use_custom_prompt = True #@param {type:\"boolean\"}\n",
"\n",
"# Template passed as `map_prompt` to the summarize chain.\n",
"map_prompt_template = \"\"\"Write a concise summary of the following YouTube transcription:\n",
"###\n",
"{text}\n",
"\n",
"SUMMARY:\n",
"\n",
"\"\"\"\n",
"\n",
"# Template passed as `combine_prompt`; the requested output language comes from `summary_language`.\n",
"final_prompt_template = \"Write a bullet point summary of the following in \" + summary_language + \"\"\":\n",
"###\n",
"{text}\n",
"\n",
"SUMMARY:\"\"\"\n",
"\n",
"# Both templates take the text to summarize through the single `text` variable.\n",
"map_prompt = PromptTemplate(template=map_prompt_template,\n",
"                            input_variables=[\"text\"])\n",
"final_prompt = PromptTemplate(template=final_prompt_template,\n",
"                              input_variables=[\"text\"])"
],
"metadata": {
"id": "YiDNATpDHdsz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title 6. Get Summary\n",
"os.environ[\"OPENAI_API_KEY\"] = openai_api_key\n",
"llm = OpenAI(temperature=0)\n",
"\n",
"# `use_custom_prompt` may be undefined (e.g. the optional custom-prompt cell was skipped).\n",
"# In that case, use the custom prompts only when both of them exist.\n",
"custom_prompts_defined = \"map_prompt\" in globals() and \"final_prompt\" in globals()\n",
"use_custom_prompt = globals().get(\"use_custom_prompt\", custom_prompts_defined)\n",
"\n",
"# Run the map_reduce summarize chain inside the OpenAI callback context,\n",
"# then print the callback object.\n",
"with get_openai_callback() as cb:\n",
"  if use_custom_prompt:\n",
"    chain = load_summarize_chain(llm, chain_type=\"map_reduce\", verbose=verbose, map_prompt=map_prompt, combine_prompt=final_prompt)\n",
"  else:\n",
"    chain = load_summarize_chain(llm, chain_type=\"map_reduce\", verbose=verbose)\n",
"  output_summary = chain.run(docs)\n",
"  print(cb)\n"
],
"metadata": {
"id": "h4nRP8oGO2qf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title 7. Print Summary\n",
"\n",
"# Print the summary; its embedded newlines already separate the output lines.\n",
"print(output_summary)"
],
"metadata": {
"id": "nbW7Yqp4OAlm"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment