# onepointconsulting/langchain_keyword_extractor.py

## langchain_keyword_extractor.py
import requests
import re
from collections import Counter

from langchain.chat_models import ChatOpenAI
from langchain.schema import (
    HumanMessage,
    SystemMessage
)

import pandas as pd


# Name of the OpenAI chat model passed to ChatOpenAI in process_keywords().
model_name = "gpt-3.5-turbo"

# Configuration
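# Put your OpenAI API key here (uncommenting the line below also requires `import os`),
# or set the OPENAI_API_KEY environment variable before running the script.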
# os.environ["OPENAI_API_KEY"] = ''


def extract_data(json_content):
    """
    Picks the fields used for keyword extraction out of a single video record.

    :param json_content: a JSON object with the video metadata; it must provide
        the keys ``base`` (holding ``title`` and ``description``), ``id`` and ``youtube_id``
    :return: a tuple with the title, description, id and youtube id (in this order)
    """
    base = json_content['base']
    title = base['title']
    description = base['description']
    return title, description, json_content['id'], json_content['youtube_id']


def process_all_records(
    batch_size: int = 100,
    *,
    base_url: str = "https://admin.thelighthouse.world/videos",
    timeout: float = 30.0,
):
    """
    Generator function which loops through all videos via a REST interface.

    The records are fetched page by page; paging stops at the first page that
    contains fewer than ``batch_size`` entries.

    :param batch_size: The size of the batch retrieved via the REST interface call. Must be positive.
    :param base_url: The endpoint listing the videos; it receives the ``_limit`` and ``_start`` query parameters.
    :param timeout: Seconds to wait for each HTTP response before giving up.
    :return: yields the tuple produced by ``extract_data`` for every record
    :raises ValueError: if ``batch_size`` is not positive (raised on the first iteration, as this is a generator)
    :raises requests.HTTPError: if the server answers with an error status code
    """
    # A non-positive batch size can never satisfy the "short page" stop condition below.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    start = 0
    while True:
        # Without a timeout a stalled connection would block the whole run forever.
        response = requests.get(
            base_url,
            params={"_limit": batch_size, "_start": start},
            timeout=timeout,
        )
        # Fail loudly instead of iterating over an error payload as if it were records.
        response.raise_for_status()
        json_content = response.json()
        for jc in json_content:
            # Pass the content to the caller using the generator with yield
            yield extract_data(jc)
        # A short (or empty) page means there is nothing left to fetch.
        if len(json_content) < batch_size:
            break
        start += batch_size


def extract_keywords(text):
    """
    Extracts the keywords from the ChatGPT generated text.

    The text is lower-cased, everything up to and including the last
    'keywords:' marker is discarded and the remainder is split on commas.
    Surrounding whitespace and a single trailing full stop are removed from
    each keyword and empty entries are dropped. The marker may be followed by
    a line break, so answers spanning several lines are handled as well.

    :param text: The answer from ChatGPT, like 'Keywords: Golden Heart, Beating, 50th Anniversary, Promo, Promotion video.'
    :return: a list with the lower-cased keywords; empty if there is no text or no 'keywords:' marker
    """
    if not text:
        return []
    # rpartition keeps the text after the LAST marker, like the greedy regex prefix did.
    _, marker, tail = text.lower().rpartition("keywords:")
    if not marker:
        return []
    keywords = []
    for raw_keyword in tail.split(','):
        keyword = raw_keyword.strip()
        # The model usually ends the list with a full stop; it is not part of the keyword.
        if keyword.endswith('.'):
            keyword = keyword[:-1].rstrip()
        # Skip blanks produced by doubled or trailing commas so they are not counted.
        if keyword:
            keywords.append(keyword)
    return keywords


def write_to_excel(popular_keywords, keyword_data):
    """
    Captures the keywords of each record in one file and then the overall keyword count in another one.

    Writes 'keyword_info.xlsx' (one row per record) and 'popular_keywords.xlsx'
    (one row per keyword with its count, highest count first) into the current
    working directory.

    :param popular_keywords: The keyword counter
    :param keyword_data: Contains the record data and the extracted keywords
    """
    pd.DataFrame(keyword_data).to_excel('keyword_info.xlsx')
    keyword_counts = [
        {'keyword': keyword, 'count': count}
        for keyword, count in popular_keywords.most_common()
    ]
    # Declaring the columns explicitly keeps sort_values() from failing with
    # KeyError('count') when the counter is empty and no rows are present.
    counts_frame = pd.DataFrame(keyword_counts, columns=['keyword', 'count'])
    counts_frame.sort_values(by=['count'], ascending=False).to_excel('popular_keywords.xlsx')


def process_keywords():
    """
    Runs the whole pipeline: creates the ChatGPT client, asks it for the keywords
    of every record delivered by ``process_all_records`` and finally hands the
    per-record rows and the overall keyword counts to ``write_to_excel``.

    A failure while handling a single record is printed and the record is skipped.
    """
    chat = ChatOpenAI(model_name=model_name, temperature=0)
    keyword_counter = Counter()
    rows = []
    for index, record in enumerate(process_all_records()):
        try:
            prompt_text, answer = extract_keywords_from_chat(chat, record)
            found = extract_keywords(answer)
            keyword_counter.update(found)
            print(index, prompt_text, keyword_counter)
            # record is the (title, description, id, youtube id) tuple
            row = {
                'id': record[2],
                'youtube_id': record[3],
                'title': record[0],
                'description': record[1],
                'keywords': ','.join(found),
            }
            rows.append(row)
        except Exception as e:
            print(f"Error occurred: {e}")

    write_to_excel(keyword_counter, rows)


def extract_keywords_from_chat(chat, record_data):
    """
    Asks ChatGPT for the keywords of a single record.

    :param chat: The object which communicates under the hood with ChatGPT.
    :param record_data: The tuple with the title, description, id and youtube id
    :return: a tuple with the text sent to the model (title and description joined
        by a space) and the content of the model's answer
    """
    title, description = record_data[0], record_data[1]
    dt_single = f"{title} {description}"
    system_prompt = "You extract the main keywords in the text and extract these into a comma separated list. Please prefix the keywords with 'Keywords:'"
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=dt_single),
    ]
    return dt_single, chat(messages).content


# Run the keyword extraction only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    process_keywords()
	import requests
	import re
	from collections import Counter

	from langchain.chat_models import ChatOpenAI
	from langchain.schema import (
	HumanMessage,
	SystemMessage
	)

	import pandas as pd


	# Name of the OpenAI chat model passed to ChatOpenAI in process_keywords().
	model_name = "gpt-3.5-turbo"

	# Configuration
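	# Put your OpenAI API key here (uncommenting the line below also requires `import os`),
	# or set the OPENAI_API_KEY environment variable before running the script.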
	# os.environ["OPENAI_API_KEY"] = ''


	def extract_data(json_content):
		"""
		Extracts the title, description, id and youtube id from the json object.

		:param json_content: a JSON object with the video metadata; it must provide
			the keys ``base`` (holding ``title`` and ``description``), ``id`` and ``youtube_id``
		:return: a tuple with the title, description, id and youtube id (in this order)
		"""
		base = json_content['base']
		return (base['title'], base['description'], json_content['id'], json_content['youtube_id'])


	def process_all_records(
		batch_size: int = 100,
		*,
		base_url: str = "https://admin.thelighthouse.world/videos",
		timeout: float = 30.0,
	):
		"""
		Generator function which loops through all videos via a REST interface.

		The records are fetched page by page; paging stops at the first page that
		contains fewer than ``batch_size`` entries.

		:param batch_size: The size of the batch retrieved via the REST interface call. Must be positive.
		:param base_url: The endpoint listing the videos; it receives the ``_limit`` and ``_start`` query parameters.
		:param timeout: Seconds to wait for each HTTP response before giving up.
		:return: yields the tuple produced by ``extract_data`` for every record
		:raises ValueError: if ``batch_size`` is not positive (raised on the first iteration, as this is a generator)
		:raises requests.HTTPError: if the server answers with an error status code
		"""
		# A non-positive batch size can never satisfy the "short page" stop condition below.
		if batch_size <= 0:
			raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
		start = 0
		while True:
			# Without a timeout a stalled connection would block the whole run forever.
			response = requests.get(
				base_url,
				params={"_limit": batch_size, "_start": start},
				timeout=timeout,
			)
			# Fail loudly instead of iterating over an error payload as if it were records.
			response.raise_for_status()
			json_content = response.json()
			for jc in json_content:
				# Pass the content to the caller using the generator with yield
				yield extract_data(jc)
			# A short (or empty) page means there is nothing left to fetch.
			if len(json_content) < batch_size:
				break
			start += batch_size


	def extract_keywords(text):
		"""
		Extracts the keywords from the ChatGPT generated text.

		The text is lower-cased, everything up to and including the last
		'keywords:' marker is discarded and the remainder is split on commas.
		Surrounding whitespace and a single trailing full stop are removed from
		each keyword and empty entries are dropped. The marker may be followed by
		a line break, so answers spanning several lines are handled as well.

		:param text: The answer from ChatGPT, like 'Keywords: Golden Heart, Beating, 50th Anniversary, Promo, Promotion video.'
		:return: a list with the lower-cased keywords; empty if there is no text or no 'keywords:' marker
		"""
		if not text:
			return []
		# rpartition keeps the text after the LAST marker, like the greedy regex prefix did.
		_, marker, tail = text.lower().rpartition("keywords:")
		if not marker:
			return []
		keywords = []
		for raw_keyword in tail.split(','):
			keyword = raw_keyword.strip()
			# The model usually ends the list with a full stop; it is not part of the keyword.
			if keyword.endswith('.'):
				keyword = keyword[:-1].rstrip()
			# Skip blanks produced by doubled or trailing commas so they are not counted.
			if keyword:
				keywords.append(keyword)
		return keywords


	def write_to_excel(popular_keywords, keyword_data):
		"""
		Captures the keywords of each record in one file and then the overall keyword count in another one.

		Writes 'keyword_info.xlsx' (one row per record) and 'popular_keywords.xlsx'
		(one row per keyword with its count, highest count first) into the current
		working directory.

		:param popular_keywords: The keyword counter
		:param keyword_data: Contains the record data and the extracted keywords
		"""
		pd.DataFrame(keyword_data).to_excel('keyword_info.xlsx')
		keyword_counts = [
			{'keyword': keyword, 'count': count}
			for keyword, count in popular_keywords.most_common()
		]
		# Declaring the columns explicitly keeps sort_values() from failing with
		# KeyError('count') when the counter is empty and no rows are present.
		counts_frame = pd.DataFrame(keyword_counts, columns=['keyword', 'count'])
		counts_frame.sort_values(by=['count'], ascending=False).to_excel('popular_keywords.xlsx')


	def process_keywords():
		"""
		Instantiates the object which interfaces with ChatGPT and loops through the records
		capturing the keywords for each record and also counting the occurrence of each of these keywords.

		A failure while handling a single record is printed and the record is skipped;
		the collected results are written with ``write_to_excel`` once all records are done.
		"""
		chat = ChatOpenAI(model_name=model_name, temperature=0)
		popular_keywords = Counter()
		keyword_data = []
		for i, record_data in enumerate(process_all_records()):
			try:
				dt_single, answer = extract_keywords_from_chat(chat, record_data)
				extracted_keywords = extract_keywords(answer)
				popular_keywords.update(extracted_keywords)
				print(i, dt_single, popular_keywords)
				# record_data is the (title, description, id, youtube id) tuple
				keyword_data.append({
					'id': record_data[2],
					'youtube_id': record_data[3],
					'title': record_data[0],
					'description': record_data[1],
					'keywords': ','.join(extracted_keywords),
				})
			except Exception as e:
				print(f"Error occurred: {e}")

		write_to_excel(popular_keywords, keyword_data)


	def extract_keywords_from_chat(chat, record_data):
		"""
		Sends a chat question to ChatGPT and returns its output.

		:param chat: The object which communicates under the hood with ChatGPT.
		:param record_data: The tuple with the title, description, id and youtube id
		:return: a tuple with the text sent to the model (title and description joined
			by a space) and the content of the model's answer
		"""
		dt_single = f"{record_data[0]} {record_data[1]}"
		resp = chat([
			SystemMessage(content=
				"You extract the main keywords in the text and extract these into a comma separated list. Please prefix the keywords with 'Keywords:'"),
			HumanMessage(content=dt_single),
		])
		answer = resp.content
		return dt_single, answer


	# Run the keyword extraction only when the file is executed as a script, not when it is imported.
	if __name__ == "__main__":
		process_keywords()