/clsVideoContentScrapper.py Secret
Created
May 29, 2023 00:00
Star
You must be signed in to star a gist
Main class to extract the transcript from the YouTube videos & then answer the questions based on the topics selected by the users.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##################################################### | |
#### Written By: SATYAKI DE #### | |
#### Written On: 27-May-2023 #### | |
#### Modified On 28-May-2023 #### | |
#### #### | |
#### Objective: This is the main calling #### | |
#### python class that will invoke the #### | |
#### LangChain package to extract ####
#### the transcript from the YouTube videos & #### | |
#### then answer the questions based on the #### | |
#### topics selected by the users. #### | |
#### #### | |
##################################################### | |
from langchain.document_loaders import YoutubeLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.vectorstores import FAISS | |
from langchain.chat_models import ChatOpenAI | |
from langchain.chains import LLMChain | |
from langchain.prompts.chat import ( | |
ChatPromptTemplate, | |
SystemMessagePromptTemplate, | |
HumanMessagePromptTemplate, | |
) | |
from googleapiclient.discovery import build | |
import clsTemplate as ct | |
from clsConfigClient import clsConfigClient as cf | |
import os | |
############################################### | |
### Global Section ### | |
############################################### | |
# OpenAI credentials come from the project configuration. The key is
# published through the environment and also handed explicitly to the
# embeddings client created here.
open_ai_Key = cf.conf['OPEN_AI_KEY']
os.environ["OPENAI_API_KEY"] = open_ai_Key
embeddings = OpenAIEmbeddings(openai_api_key=open_ai_Key)

# YouTube Data API (v3) client, built once at import time and shared by the
# class below for video and channel look-ups.
YouTube_Key = cf.conf['YOUTUBE_KEY']
youtube = build(serviceName='youtube', version='v3', developerKey=YouTube_Key)
# Disabling warnings
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; stand-in for ``warnings.warn``."""
    return None


# Install the no-op in place of the real function so that calls made through
# ``warnings.warn(...)`` from this point on emit nothing.
warnings.warn = warn
############################################### | |
### End of Global Section ### | |
############################################### | |
class clsVideoContentScrapper:
    """Answer a question on a topic using transcripts of YouTube videos.

    The flow is: search YouTube for videos on a topic, load each video's
    transcript into a FAISS vector store, retrieve the chunks most similar
    to the question and ask the chat model to answer from those chunks.

    Errors are reported with ``print`` and turned into empty results rather
    than raised, so one failing video does not abort the whole run.
    """

    def __init__(self):
        # Model name, sampling temperature and the number of videos to
        # search for all come from the shared project configuration.
        self.model_name = cf.conf['MODEL_NAME']
        self.temp_val = cf.conf['TEMP_VAL']
        self.max_cnt = int(cf.conf['MAX_CNT'])

    def createDBFromYoutubeVideoUrl(self, video_url):
        """Build a FAISS vector store from the transcript of one video.

        The transcript is split into chunks of 1000 characters with an
        overlap of 100 and embedded with the module-level ``embeddings``.

        Args:
            video_url: URL of the YouTube video.

        Returns:
            The FAISS store, or ``''`` when the transcript could not be
            loaded or indexed (the error is printed).
        """
        try:
            loader = YoutubeLoader.from_youtube_url(video_url)
            transcript = loader.load()

            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
            docs = text_splitter.split_documents(transcript)

            return FAISS.from_documents(docs, embeddings)
        except Exception as e:
            print('Error: ', str(e))

            # Empty-string sentinel kept for backward compatibility.
            return ''

    def getResponseFromQuery(self, db, query, k=4):
        """Answer ``query`` from the ``k`` most similar transcript chunks.

        gpt-3.5-turbo can handle up to 4097 tokens. Setting the chunk size
        to 1000 and ``k`` to 4 maximizes the number of tokens to analyze.

        Args:
            db: Vector store returned by ``createDBFromYoutubeVideoUrl``.
                ``None`` or the ``''`` failure sentinel yields an empty
                result.
            query: The question to answer.
            k: Number of chunks to retrieve from the store.

        Returns:
            ``(response, docs)``: the answer with newlines removed and the
            retrieved documents; ``('', '')`` when no store is available or
            any step fails (the error is printed).
        """
        # A failed transcript load hands us the '' sentinel; bail out here
        # instead of raising and catching a misleading AttributeError.
        if db is None or isinstance(db, str):
            return '', ''

        try:
            docs = db.similarity_search(query, k=k)
            docs_page_content = " ".join([d.page_content for d in docs])

            chat = ChatOpenAI(model_name=self.model_name, temperature=self.temp_val)

            # System prompt comes from the project template module; it is
            # presumably parameterised on {docs} -- confirm in clsTemplate.
            system_message_prompt = SystemMessagePromptTemplate.from_template(ct.templateVal_1)

            # Human question prompt
            human_template = "Answer the following question: {question}"
            human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

            chat_prompt = ChatPromptTemplate.from_messages(
                [system_message_prompt, human_message_prompt]
            )

            chain = LLMChain(llm=chat, prompt=chat_prompt)
            response = chain.run(question=query, docs=docs_page_content)

            return response.replace("\n", ""), docs
        except Exception as e:
            print('Error: ', str(e))

            return '', ''

    def topFiveURLFromYouTube(self, service, **kwargs):
        """Search YouTube and collect video URLs with their channel names.

        Args:
            service: YouTube Data API client exposing ``search()`` and
                ``channels()``.
            **kwargs: Passed unchanged to ``service.search().list(...)``.

        Returns:
            ``(video_urls, channel_list)``: two lists of equal length where
            entry *i* of each refers to the same search hit. Both are empty
            when the search or any per-item look-up fails (the error is
            printed).
        """
        try:
            video_urls = []
            channel_list = []

            results = service.search().list(**kwargs).execute()

            for item in results['items']:
                snippet = item['snippet']
                print("Title: ", snippet['title'])
                print("Description: ", snippet['description'])

                channel = snippet['channelId']
                print("Channel Id: ", channel)

                # Fetch the channel name using the channel ID
                channel_response = service.channels().list(part='snippet', id=channel).execute()
                channel_title = channel_response['items'][0]['snippet']['title']
                print("Channel Title: ", channel_title)
                channel_list.append(channel_title)

                video_id = item['id']['videoId']
                print("Video Id: ", video_id)

                vidURL = "https://www.youtube.com/watch?v=" + video_id
                print("Video URL: " + vidURL)
                video_urls.append(vidURL)
                print("\n")

            return video_urls, channel_list
        except Exception as e:
            print('Error: ', str(e))

            # Partial results are dropped so the two lists can never be
            # returned out of step with each other.
            return [], []

    def extractContentInText(self, topic, query):
        """Answer ``query`` from each of the top videos found for ``topic``.

        Args:
            topic: Search phrase used to find videos.
            query: Question asked against every video's transcript.

        Returns:
            A list of strings, one per video that produced a non-empty
            answer, each prefixed with the channel it came from. Empty when
            nothing was found or an unexpected error occurred (the error is
            printed).
        """
        try:
            discussedTopic = []

            urlList, channelList = self.topFiveURLFromYouTube(
                youtube, q=topic, part='id,snippet', maxResults=self.max_cnt, type='video'
            )

            print('Returned List: ')
            print(urlList)
            print()

            # Walk URL and channel name in lockstep so the attribution stays
            # correct even when some videos yield no answer.
            for video_url, channel_title in zip(urlList, channelList):
                print('Processing Video: ')
                print(video_url)

                db = self.createDBFromYoutubeVideoUrl(video_url)
                response, _ = self.getResponseFromQuery(db, query)

                if len(response) > 0:
                    discussedTopic.append(
                        'As per the topic discussed in ' + channel_title + ', ' + response
                    )

            return discussedTopic
        except Exception as e:
            print('Error: ', str(e))

            return []
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment