Created
June 7, 2023 08:46
-
-
Save onepointconsulting/7dace39330595d9e733c24fca0848ded to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
from collections import Counter | |
from langchain.chat_models import ChatOpenAI | |
from langchain.schema import ( | |
HumanMessage, | |
SystemMessage | |
) | |
import pandas as pd | |
# Name of the OpenAI chat model; process_keywords() passes it to ChatOpenAI.
model_name = "gpt-3.5-turbo"

# Configuration
# Put your OpenAI API key here. Uncommenting the line below also requires
# `import os`, which is not among the imports at the top of this file.
# os.environ["OPENAI_API_KEY"] = ''
def extract_data(json_content):
    """
    Pulls the fields of interest out of a single video record.

    :param json_content: a JSON object (dict) with the video metadata; the title and
                         the description are read from its 'base' entry
    :return: a tuple of the form (title, description, id, youtube id)
    """
    base = json_content['base']
    title = base['title']
    description = base['description']
    return title, description, json_content['id'], json_content['youtube_id']
def process_all_records(batch_size: int = 100):
    """
    Generator function which loops through all videos via a REST interface.

    The videos are requested page by page; a page with fewer than ``batch_size``
    records marks the end of the data.

    :param batch_size: The size of the batch retrieved via the REST interface call.
                       Must be positive.
    :return: yields one tuple (title, description, id, youtube id) per video
    :raises ValueError: if ``batch_size`` is not positive. Because this is a generator,
                        the error surfaces when iteration starts.
    :raises requests.RequestException: on network failure, timeout or HTTP error status
    """
    # With a non-positive batch size an empty page is never "shorter" than the
    # batch and `start` never advances, so the loop below would never end.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    url = "https://admin.thelighthouse.world/videos"
    start = 0
    while True:
        response = requests.get(
            url,
            params={"_limit": batch_size, "_start": start},
            timeout=30,  # do not hang forever on an unresponsive server
        )
        # Fail loudly on 4xx/5xx instead of iterating over an error payload
        response.raise_for_status()
        json_content = response.json()
        for jc in json_content:
            # Pass the content to the caller using the generator with yield
            yield extract_data(jc)
        if len(json_content) < batch_size:
            break
        start += batch_size
def extract_keywords(text):
    """
    Extracts the keywords from the ChatGPT generated text.

    Everything after the last 'keywords:' marker (case-insensitive) is split on commas.
    Each keyword is lower-cased, stripped of surrounding whitespace and of one trailing
    period. Empty entries are dropped.

    :param text: The answer from ChatGPT, like 'Keywords: Golden Heart, Beating, 50th Anniversary, Promo, Promotion video.'
    :return: the list of keywords in order of appearance; an empty list if the text is
             empty or contains no 'keywords:' marker
    """
    if not text:
        return []
    # re.S lets the keyword list start on the line after the marker or span several lines
    match = re.search(r".*keywords:(.+?)$", text.lower(), flags=re.S)
    if match is None:
        return []
    keywords = []
    for raw_keyword in match.group(1).split(','):
        keyword = raw_keyword.strip()
        if keyword.endswith('.'):
            keyword = keyword[:-1].rstrip()
        # Skip blanks produced by inputs such as 'Keywords: ' or 'a,, b'
        if keyword:
            keywords.append(keyword)
    return keywords
def write_to_excel(popular_keywords, keyword_data):
    """
    Captures the keywords of each record in one file and then the overall keyword count in another one.

    Writes 'keyword_info.xlsx' (one row per record) and 'popular_keywords.xlsx'
    (one row per keyword, highest count first) into the current working directory.

    :param popular_keywords: The keyword counter
    :param keyword_data: Contains the record data and the extracted keywords
    """
    pd.DataFrame(keyword_data).to_excel('keyword_info.xlsx')
    keyword_counts = [
        {'keyword': keyword, 'count': count}
        for keyword, count in popular_keywords.most_common()
    ]
    # Naming the columns explicitly keeps the 'count' column present even when
    # the counter is empty; otherwise sort_values would raise a KeyError.
    counts_frame = pd.DataFrame(keyword_counts, columns=['keyword', 'count'])
    counts_frame.sort_values(by=['count'], ascending=False).to_excel('popular_keywords.xlsx')
def process_keywords():
    """
    Instantiates the object which interfaces with ChatGPT and loops through the records
    capturing the keywords for each record and also counting the occurrence of each of these keywords.

    A failure while processing a single record is printed and the record is skipped.
    Whatever has been collected is written to Excel even if fetching the records
    fails part-way through or the run is interrupted.
    """
    chat = ChatOpenAI(model_name=model_name, temperature=0)
    popular_keywords = Counter()
    keyword_data = []
    try:
        for i, record_data in enumerate(process_all_records()):
            try:
                dt_single, answer = extract_keywords_from_chat(chat, record_data)
                extracted_keywords = extract_keywords(answer)
                popular_keywords.update(extracted_keywords)
                print(i, dt_single, popular_keywords)
                title, description, record_id, youtube_id = record_data
                keyword_data.append({
                    'id': record_id,
                    'youtube_id': youtube_id,
                    'title': title,
                    'description': description,
                    'keywords': ','.join(extracted_keywords),
                })
            except Exception as e:
                # Best effort: one bad record must not abort the whole run
                print(f"Error occurred: {e}")
    finally:
        # The record generator performs network calls outside the inner try;
        # persist the partial results instead of losing them when it fails.
        write_to_excel(popular_keywords, keyword_data)
def extract_keywords_from_chat(chat, record_data):
    """
    Asks ChatGPT for the main keywords of a single record.

    :param chat: The object which communicates under the hood with ChatGPT. It is called
                 with a list of messages and the ``content`` of its result is read.
    :param record_data: The tuple with the title, description, id and youtube id
    :return: a tuple with the text sent to the model (title and description joined by a
             space) and the answer of the model
    """
    title, description = record_data[0], record_data[1]
    prompt_text = f"{title} {description}"
    system_prompt = (
        "You extract the main keywords in the text and extract these into a comma separated list. Please prefix the keywords with 'Keywords:'"
    )
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=prompt_text),
    ]
    reply = chat(messages)
    return prompt_text, reply.content
# Run the keyword extraction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    process_keywords()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment