Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save onepointconsulting/7dace39330595d9e733c24fca0848ded to your computer and use it in GitHub Desktop.
Save onepointconsulting/7dace39330595d9e733c24fca0848ded to your computer and use it in GitHub Desktop.
import requests
import re
from collections import Counter
from langchain.chat_models import ChatOpenAI
from langchain.schema import (
HumanMessage,
SystemMessage
)
import pandas as pd
# Name of the OpenAI chat model handed to ChatOpenAI in process_keywords().
model_name = "gpt-3.5-turbo"
# Configuration
# Put your OpenAI API key here by uncommenting the line below and filling in the value.
# NOTE: that line uses `os`, which is not among the imports visible in this file,
# so `import os` has to be added alongside it.
# os.environ["OPENAI_API_KEY"] = ''
def extract_data(json_content):
    """
    Pulls the fields of interest out of a single video record.

    The title and the description live under the nested 'base' object, while the
    id and the youtube id sit at the top level of the record.

    :param json_content: a JSON object (dict) with the video metadata
    :return: a tuple with the title, description, id and youtube id, in that order
    :raises KeyError: if any of the expected keys is missing from the record
    """
    base = json_content['base']
    title = base['title']
    description = base['description']
    record_id = json_content['id']
    youtube_id = json_content['youtube_id']
    return title, description, record_id, youtube_id
def process_all_records(batch_size: int = 100, timeout: float = 30.0):
    """
    Generator function which loops through all videos via a REST interface.

    The endpoint is paged ``batch_size`` records at a time; iteration stops after
    the first page that comes back with fewer than ``batch_size`` records.

    :param batch_size: The size of the batch retrieved via the REST interface call. Must be at least 1.
    :param timeout: Seconds to wait for each HTTP request before giving up.
    :return: yields, for every video, the tuple produced by ``extract_data``
    :raises ValueError: if ``batch_size`` is smaller than 1 or the response body is not a JSON list
    :raises requests.HTTPError: if the server answers with an error status code
    """
    # A non-positive batch size can never satisfy the "short page" stop condition
    # below, so the loop would never terminate.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    url = "https://admin.thelighthouse.world/videos"
    start = 0
    while True:
        # Without a timeout a stalled connection would block the whole run forever.
        response = requests.get(
            url,
            params={"_limit": batch_size, "_start": start},
            timeout=timeout,
        )
        # Fail loudly on 4xx/5xx instead of iterating over an error payload.
        response.raise_for_status()
        page = response.json()
        if not isinstance(page, list):
            raise ValueError(f"Expected a JSON list of videos, got {type(page).__name__}")
        for jc in page:
            # Pass the content to the caller using the generator with yield
            yield extract_data(jc)
        # A short page means there is nothing left to fetch.
        if len(page) < batch_size:
            break
        start += batch_size
def extract_keywords(text):
    """
    Extracts the keywords from the ChatGPT generated text.

    The text is lower-cased and everything after the last 'keywords:' marker is
    treated as a comma separated list. Each entry is trimmed, loses one trailing
    period, and is dropped if nothing is left. The list may span several lines.

    :param text: The answer from ChatGPT, like 'Keywords: Golden Heart, Beating, 50th Anniversary, Promo, Promotion video.'
    :return: the list of lower-cased keywords; empty if the marker is absent or no keyword follows it
    """
    if not text:
        return []
    # rpartition picks the last marker, so any preamble that itself mentions
    # "keywords:" is skipped.
    _, marker, listing = text.lower().rpartition("keywords:")
    if not marker:
        return []
    keywords = []
    for chunk in listing.split(","):
        keyword = chunk.strip()
        if keyword.endswith("."):
            keyword = keyword[:-1].rstrip()
        # Skip blanks produced by trailing or doubled commas so they are not counted.
        if keyword:
            keywords.append(keyword)
    return keywords
def write_to_excel(popular_keywords, keyword_data):
    """
    Captures the keywords of each record in one file and then the overall keyword count in another one.

    Writes 'keyword_info.xlsx' (one row per record) and 'popular_keywords.xlsx'
    (one row per keyword, highest count first) into the current working directory.

    :param popular_keywords: The keyword counter
    :param keyword_data: Contains the record data and the extracted keywords
    """
    pd.DataFrame(keyword_data).to_excel('keyword_info.xlsx')
    # Naming the columns explicitly keeps the 'count' column present even when the
    # counter is empty; otherwise sort_values would raise KeyError on an empty frame.
    keyword_counts = pd.DataFrame(popular_keywords.most_common(), columns=['keyword', 'count'])
    keyword_counts.sort_values(by=['count'], ascending=False).to_excel('popular_keywords.xlsx')
def process_keywords():
    """
    Runs the whole pipeline: asks ChatGPT for the keywords of every video record,
    tallies how often each keyword occurs, and finally writes both the per-record
    keywords and the overall tally to Excel.

    A failure while handling one record is printed and that record is skipped, so
    the remaining records are still processed.
    """
    llm = ChatOpenAI(model_name=model_name, temperature=0)
    keyword_counts = Counter()
    rows = []
    for index, record in enumerate(process_all_records()):
        try:
            prompt_text, reply = extract_keywords_from_chat(llm, record)
            found = extract_keywords(reply)
            keyword_counts.update(found)
            # Progress output: record index, the text sent, and the running tally.
            print(index, prompt_text, keyword_counts)
            title, description, record_id, youtube_id = record
            row = {
                'id': record_id,
                'youtube_id': youtube_id,
                'title': title,
                'description': description,
                'keywords': ','.join(found),
            }
            rows.append(row)
        except Exception as e:
            print(f"Error occurred: {e}")
    write_to_excel(keyword_counts, rows)
def extract_keywords_from_chat(chat, record_data):
    """
    Asks ChatGPT for the keywords of one record and hands back its reply.

    The title and the description are joined with a single space and sent as the
    human message, preceded by a system message that requests a comma separated
    keyword list prefixed with 'Keywords:'.

    :param chat: The object which communicates under the hood with ChatGPT.
    :param record_data: The tuple with the title, description, id and youtube id
    :return: a tuple with the text that was sent and the content of the reply
    """
    title, description = record_data[0], record_data[1]
    dt_single = f"{title} {description}"
    instructions = SystemMessage(
        content="You extract the main keywords in the text and extract these into a comma separated list. Please prefix the keywords with 'Keywords:'"
    )
    question = HumanMessage(content=dt_single)
    reply = chat([instructions, question])
    return dt_single, reply.content
# Run the keyword extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    process_keywords()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment