# zoharbabin/kaltura_llama_esearch_object.py

## kaltura_llama_esearch_object.py
import logging
import sys
from llama_index import (
    download_loader,
    GPTVectorStoreIndex,
    LLMPredictor,
    ServiceContext
)
from langchain.llms import OpenAI
from KalturaClient.Plugins.Core import KalturaMediaType
from KalturaClient.Plugins.ElasticSearch import (
    KalturaESearchSortOrder, KalturaESearchEntryOrderByFieldName,
    KalturaESearchOrderBy, KalturaESearchEntryOrderByItem, KalturaESearchCaptionItem,
    KalturaESearchEntryItem, KalturaESearchEntryFieldName, KalturaESearchCaptionFieldName,
    KalturaESearchEntryParams, KalturaESearchCategoryEntryItem, KalturaESearchEntryOperator,
    KalturaESearchOperatorType, KalturaESearchItemType, KalturaCategoryEntryStatus, KalturaESearchCategoryEntryFieldName
)

## Kaltura account credentials (placeholder values; substitute your own)
PARTNER_ID: int = 0
USER_ID: str = "LlamaTester"
API_SECRET: str = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx"

## Kaltura session (KS) settings handed to the reader
KS_TYPE: int = 2  # NOTE(review): numeric session type; meaning is defined by the Kaltura API — confirm
KS_PRIVILEGES: str = "disableentitlement"
KS_EXPIRY: int = 86400  # presumably seconds (24 hours) — TODO confirm against the reader

## API transport settings
KALTURA_API_ENDPOINT: str = "https://cdnapi-ev.kaltura.com/"
REQUEST_TIMEOUT: int = 500
SHOULD_LOG_API_CALLS: bool = True

## Search scope
CATEGORY_NAME_TO_FILTER: str = "categoryname"  # <-- replace this with your category name
MAX_ENTRIES: int = 1  # how many entries to load (pageSize)

# Send WARNING-and-above records to stdout. basicConfig() already installs a
# stdout StreamHandler on the root logger, so no second StreamHandler is
# attached here: adding one would make every record print twice.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)

# Obtain the KalturaESearchReader loader class through llama_index's
# download_loader, using the zoharbabin llama-hub fork as the hub URL and a
# sibling checkout directory as the custom path.
KalturaESearchReader = download_loader(
    loader_class="KalturaESearchReader",
    loader_hub_url="https://raw.githubusercontent.com/zoharbabin/llama-hub/main/loader_hub/",
    custom_path="../llama-hub/loader_hub",
)

# Build the reader from the module-level Kaltura settings defined above.
reader = KalturaESearchReader(
    # account
    partner_id=PARTNER_ID,
    user_id=USER_ID,
    api_secret=API_SECRET,
    # session (KS)
    ks_type=KS_TYPE,
    ks_privileges=KS_PRIVILEGES,
    ks_expiry=KS_EXPIRY,
    # transport
    kaltura_api_endpoint=KALTURA_API_ENDPOINT,
    request_timeout=REQUEST_TIMEOUT,
    should_log_api_calls=SHOULD_LOG_API_CALLS,
)  # type: ignore KalturaESearchReader

# --- Individual search criteria ---------------------------------------------
# Criterion 1: the entry has caption content (EXISTS on the caption CONTENT field).
caption_item = KalturaESearchCaptionItem()
caption_item.itemType = KalturaESearchItemType.EXISTS
caption_item.fieldName = KalturaESearchCaptionFieldName.CONTENT

# Criterion 2: the entry is actively assigned to the category whose name
# exactly matches CATEGORY_NAME_TO_FILTER.
category_item = KalturaESearchCategoryEntryItem()
category_item.searchTerm = CATEGORY_NAME_TO_FILTER
category_item.itemType = KalturaESearchItemType.EXACT_MATCH
category_item.fieldName = KalturaESearchCategoryEntryFieldName.NAME
category_item.categoryEntryStatus = KalturaCategoryEntryStatus.ACTIVE
category_item.addHighlight = False

# Criterion 3: the entry's media type is exactly KalturaMediaType.VIDEO.
entry_item = KalturaESearchEntryItem()
entry_item.searchTerm = KalturaMediaType.VIDEO
entry_item.itemType = KalturaESearchItemType.EXACT_MATCH
entry_item.fieldName = KalturaESearchEntryFieldName.MEDIA_TYPE
entry_item.addHighlight = False

# --- Assemble the eSearch parameters ----------------------------------------
search_params = KalturaESearchEntryParams()

# Ordering: a single order item, UPDATED_AT descending (most recently updated first).
search_params.orderBy = KalturaESearchOrderBy()
search_params.orderBy.orderItems = [KalturaESearchEntryOrderByItem()]
search_params.orderBy.orderItems[0].sortOrder = KalturaESearchSortOrder.ORDER_BY_DESC
search_params.orderBy.orderItems[0].sortField = KalturaESearchEntryOrderByFieldName.UPDATED_AT

# Combine the three criteria with AND, in the order caption, category, media type.
search_params.searchOperator = KalturaESearchEntryOperator()
search_params.searchOperator.operator = KalturaESearchOperatorType.AND_OP
search_params.searchOperator.searchItems = [caption_item, category_item, entry_item]

# Fetch documents for the entries matching search_params.
# NOTE(review): the meaning of the positional `True, 5` arguments is defined by
# the downloaded loader's load_data signature — confirm. MAX_ENTRIES is defined
# above but is not used by this call; the literal 5 is passed instead.
entry_docs = reader.load_data(search_params, True, 5)

# LLM predictor (OpenAI text-davinci-003, temperature 0, streaming) + service context
llm_predictor = LLMPredictor(
    llm=OpenAI(temperature=0, model_name="text-davinci-003", streaming=True)
)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

# Build a vector index over the loaded documents and a streaming query engine
# that retrieves with similarity_top_k=10.
index = GPTVectorStoreIndex.from_documents(entry_docs, service_context=service_context)
query_engine = index.as_query_engine(streaming=True, similarity_top_k=10)

request = "the top 5 video segments where the speaker discusses the future of events in education"
# The field descriptions below use exactly the keys shown in the JSON example
# ("keywords", not "keyword") so the prompt does not contradict itself.
response_stream = query_engine.query(
    "Provide a json formatted response of the following: " + request
    + ". Your json response should look like so: {startTime: xxx, endTime:zzz, speakerName: zzz, keywords: nnn} ."
    + "startTime represents the time in the video this segment begins. "
    + "endTime represents the time in the video this segment ends and another topic begins (segment can be multiple lines long). "
    + "speakerName represents the primary speaker talking in that segment. "
    + "keywords represents a one-word description of the segment as a title of that segment. "
)
# Print the streamed response.
response_stream.print_response_stream()

## kaltura_llama_txt_simple.py
import logging
import sys
from llama_index import GPTVectorStoreIndex, download_loader

# Kaltura account credentials (placeholder values; substitute your own)
PARTNER_ID: int = 0
USER_ID: str = "LlamaTester"
API_SECRET: str = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx"

# Kaltura session (KS) settings handed to the reader
KS_TYPE: int = 2  # NOTE(review): numeric session type; meaning is defined by the Kaltura API — confirm
KS_PRIVILEGES: str = "disableentitlement"
KS_EXPIRY: int = 86400  # presumably seconds (24 hours) — TODO confirm against the reader

# API transport settings
KALTURA_API_ENDPOINT: str = "https://cdnapi-ev.kaltura.com/"
REQUEST_TIMEOUT: int = 500
SHOULD_LOG_API_CALLS: bool = True

# Search scope
CATEGORY_IDS_TO_FILTER: str = "123,56,6846"  # <-- replace this with your categories
MAX_ENTRIES: int = 1  # how many entries to load (pageSize)

# Send WARNING-and-above records to stdout. basicConfig() already installs a
# stdout StreamHandler on the root logger, so no second StreamHandler is
# attached here: adding one would make every record print twice.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)

# Obtain the KalturaESearchReader loader class through llama_index's
# download_loader, using the zoharbabin llama-hub fork as the hub URL and a
# sibling checkout directory as the custom path.
KalturaESearchReader = download_loader(
    loader_class="KalturaESearchReader",
    loader_hub_url="https://raw.githubusercontent.com/zoharbabin/llama-hub/main/loader_hub/",
    custom_path="../llama-hub/loader_hub",
)

# Build the reader from the module-level Kaltura settings defined above.
reader = KalturaESearchReader(
    # account
    partner_id=PARTNER_ID,
    user_id=USER_ID,
    api_secret=API_SECRET,
    # session (KS)
    ks_type=KS_TYPE,
    ks_privileges=KS_PRIVILEGES,
    ks_expiry=KS_EXPIRY,
    # transport
    kaltura_api_endpoint=KALTURA_API_ENDPOINT,
    request_timeout=REQUEST_TIMEOUT,
    should_log_api_calls=SHOULD_LOG_API_CALLS,
)  # type: ignore KalturaESearchReader

# Fetch documents via the reader's keyword-style search: free text "education",
# captions requested, max_entries=5, search_operator_and=True.
# NOTE(review): category_ids is passed as None, so CATEGORY_IDS_TO_FILTER
# (defined above) is not applied; MAX_ENTRIES is likewise unused — confirm
# whether either constant was meant to be passed here.
entry_docs = reader.load_data(
    search_operator_and=True,
    free_text="education",
    category_ids=None,
    with_captions=True,
    max_entries=5,
)

# Build a vector index over the loaded documents and a query engine with
# llama_index's default settings.
index = GPTVectorStoreIndex.from_documents(entry_docs)
query_engine = index.as_query_engine()

request = "the top 5 video segments where the speaker discusses the future of events in education"
# The field descriptions below use exactly the keys shown in the JSON example
# ("keywords", not "keyword") so the prompt does not contradict itself.
response = query_engine.query(
    "Provide a json formatted response of the following: " + request
    + ". Your json response should look like so: {startTime: xxx, endTime:zzz, speakerName: zzz, keywords: nnn} ."
    + "startTime represents the time in the video this segment begins. "
    + "endTime represents the time in the video this segment ends and another topic begins (segment can be multiple lines long). "
    + "speakerName represents the primary speaker talking in that segment. "
    + "keywords represents a one-word description of the segment as a title of that segment. "
)
print(response)
	import logging
	import sys
	from llama_index import (
	download_loader,
	GPTVectorStoreIndex,
	LLMPredictor,
	ServiceContext
	)
	from langchain.llms import OpenAI
	from KalturaClient.Plugins.Core import KalturaMediaType
	from KalturaClient.Plugins.ElasticSearch import (
	KalturaESearchSortOrder, KalturaESearchEntryOrderByFieldName,
	KalturaESearchOrderBy, KalturaESearchEntryOrderByItem, KalturaESearchCaptionItem,
	KalturaESearchEntryItem, KalturaESearchEntryFieldName, KalturaESearchCaptionFieldName,
	KalturaESearchEntryParams, KalturaESearchCategoryEntryItem, KalturaESearchEntryOperator,
	KalturaESearchOperatorType, KalturaESearchItemType, KalturaCategoryEntryStatus, KalturaESearchCategoryEntryFieldName
	)

	## Kaltura account credentials (placeholder values; substitute your own)
	PARTNER_ID: int = 0
	USER_ID: str = "LlamaTester"
	API_SECRET: str = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx"

	## Kaltura session (KS) settings handed to the reader
	KS_TYPE: int = 2  # NOTE(review): numeric session type; meaning is defined by the Kaltura API — confirm
	KS_PRIVILEGES: str = "disableentitlement"
	KS_EXPIRY: int = 86400  # presumably seconds (24 hours) — TODO confirm against the reader

	## API transport settings
	KALTURA_API_ENDPOINT: str = "https://cdnapi-ev.kaltura.com/"
	REQUEST_TIMEOUT: int = 500
	SHOULD_LOG_API_CALLS: bool = True

	## Search scope
	CATEGORY_NAME_TO_FILTER: str = "categoryname"  # <-- replace this with your category name
	MAX_ENTRIES: int = 1  # how many entries to load (pageSize)

	# Send WARNING-and-above records to stdout. basicConfig() already installs a
	# stdout StreamHandler on the root logger, so no second StreamHandler is
	# attached here: adding one would make every record print twice.
	logging.basicConfig(stream=sys.stdout, level=logging.WARNING)

	# Obtain the KalturaESearchReader loader class through llama_index's
	# download_loader, using the zoharbabin llama-hub fork as the hub URL and a
	# sibling checkout directory as the custom path.
	KalturaESearchReader = download_loader(
	    loader_class="KalturaESearchReader",
	    loader_hub_url="https://raw.githubusercontent.com/zoharbabin/llama-hub/main/loader_hub/",
	    custom_path="../llama-hub/loader_hub",
	)

	# Build the reader from the module-level Kaltura settings defined above.
	reader = KalturaESearchReader(
	    # account
	    partner_id=PARTNER_ID,
	    user_id=USER_ID,
	    api_secret=API_SECRET,
	    # session (KS)
	    ks_type=KS_TYPE,
	    ks_privileges=KS_PRIVILEGES,
	    ks_expiry=KS_EXPIRY,
	    # transport
	    kaltura_api_endpoint=KALTURA_API_ENDPOINT,
	    request_timeout=REQUEST_TIMEOUT,
	    should_log_api_calls=SHOULD_LOG_API_CALLS,
	) # type: ignore KalturaESearchReader

	# --- Individual search criteria ---------------------------------------------
	# Criterion 1: the entry has caption content (EXISTS on the caption CONTENT field).
	caption_item = KalturaESearchCaptionItem()
	caption_item.itemType = KalturaESearchItemType.EXISTS
	caption_item.fieldName = KalturaESearchCaptionFieldName.CONTENT

	# Criterion 2: the entry is actively assigned to the category whose name
	# exactly matches CATEGORY_NAME_TO_FILTER.
	category_item = KalturaESearchCategoryEntryItem()
	category_item.searchTerm = CATEGORY_NAME_TO_FILTER
	category_item.itemType = KalturaESearchItemType.EXACT_MATCH
	category_item.fieldName = KalturaESearchCategoryEntryFieldName.NAME
	category_item.categoryEntryStatus = KalturaCategoryEntryStatus.ACTIVE
	category_item.addHighlight = False

	# Criterion 3: the entry's media type is exactly KalturaMediaType.VIDEO.
	entry_item = KalturaESearchEntryItem()
	entry_item.searchTerm = KalturaMediaType.VIDEO
	entry_item.itemType = KalturaESearchItemType.EXACT_MATCH
	entry_item.fieldName = KalturaESearchEntryFieldName.MEDIA_TYPE
	entry_item.addHighlight = False

	# --- Assemble the eSearch parameters ----------------------------------------
	search_params = KalturaESearchEntryParams()

	# Ordering: a single order item, UPDATED_AT descending (most recently updated first).
	search_params.orderBy = KalturaESearchOrderBy()
	search_params.orderBy.orderItems = [KalturaESearchEntryOrderByItem()]
	search_params.orderBy.orderItems[0].sortOrder = KalturaESearchSortOrder.ORDER_BY_DESC
	search_params.orderBy.orderItems[0].sortField = KalturaESearchEntryOrderByFieldName.UPDATED_AT

	# Combine the three criteria with AND, in the order caption, category, media type.
	search_params.searchOperator = KalturaESearchEntryOperator()
	search_params.searchOperator.operator = KalturaESearchOperatorType.AND_OP
	search_params.searchOperator.searchItems = [caption_item, category_item, entry_item]

	# Fetch documents for the entries matching search_params.
	# NOTE(review): the meaning of the positional `True, 5` arguments is defined by
	# the downloaded loader's load_data signature — confirm. MAX_ENTRIES is defined
	# above but is not used by this call; the literal 5 is passed instead.
	entry_docs = reader.load_data(search_params, True, 5)

	# LLM predictor (OpenAI text-davinci-003, temperature 0, streaming) + service context
	llm_predictor = LLMPredictor(
	    llm=OpenAI(temperature=0, model_name="text-davinci-003", streaming=True)
	)
	service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

	# Build a vector index over the loaded documents and a streaming query engine
	# that retrieves with similarity_top_k=10.
	index = GPTVectorStoreIndex.from_documents(entry_docs, service_context=service_context)
	query_engine = index.as_query_engine(streaming=True, similarity_top_k=10)

	request = "the top 5 video segments where the speaker discusses the future of events in education"
	# The field descriptions below use exactly the keys shown in the JSON example
	# ("keywords", not "keyword") so the prompt does not contradict itself.
	response_stream = query_engine.query(
	    "Provide a json formatted response of the following: " + request
	    + ". Your json response should look like so: {startTime: xxx, endTime:zzz, speakerName: zzz, keywords: nnn} ."
	    + "startTime represents the time in the video this segment begins. "
	    + "endTime represents the time in the video this segment ends and another topic begins (segment can be multiple lines long). "
	    + "speakerName represents the primary speaker talking in that segment. "
	    + "keywords represents a one-word description of the segment as a title of that segment. "
	)
	# Print the streamed response.
	response_stream.print_response_stream()
	import logging
	import sys
	from llama_index import GPTVectorStoreIndex, download_loader

	# Kaltura account credentials (placeholder values; substitute your own)
	PARTNER_ID: int = 0
	USER_ID: str = "LlamaTester"
	API_SECRET: str = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx"

	# Kaltura session (KS) settings handed to the reader
	KS_TYPE: int = 2  # NOTE(review): numeric session type; meaning is defined by the Kaltura API — confirm
	KS_PRIVILEGES: str = "disableentitlement"
	KS_EXPIRY: int = 86400  # presumably seconds (24 hours) — TODO confirm against the reader

	# API transport settings
	KALTURA_API_ENDPOINT: str = "https://cdnapi-ev.kaltura.com/"
	REQUEST_TIMEOUT: int = 500
	SHOULD_LOG_API_CALLS: bool = True

	# Search scope
	CATEGORY_IDS_TO_FILTER: str = "123,56,6846"  # <-- replace this with your categories
	MAX_ENTRIES: int = 1  # how many entries to load (pageSize)

	# Send WARNING-and-above records to stdout. basicConfig() already installs a
	# stdout StreamHandler on the root logger, so no second StreamHandler is
	# attached here: adding one would make every record print twice.
	logging.basicConfig(stream=sys.stdout, level=logging.WARNING)

	# Obtain the KalturaESearchReader loader class through llama_index's
	# download_loader, using the zoharbabin llama-hub fork as the hub URL and a
	# sibling checkout directory as the custom path.
	KalturaESearchReader = download_loader(
	    loader_class="KalturaESearchReader",
	    loader_hub_url="https://raw.githubusercontent.com/zoharbabin/llama-hub/main/loader_hub/",
	    custom_path="../llama-hub/loader_hub",
	)

	# Build the reader from the module-level Kaltura settings defined above.
	reader = KalturaESearchReader(
	    # account
	    partner_id=PARTNER_ID,
	    user_id=USER_ID,
	    api_secret=API_SECRET,
	    # session (KS)
	    ks_type=KS_TYPE,
	    ks_privileges=KS_PRIVILEGES,
	    ks_expiry=KS_EXPIRY,
	    # transport
	    kaltura_api_endpoint=KALTURA_API_ENDPOINT,
	    request_timeout=REQUEST_TIMEOUT,
	    should_log_api_calls=SHOULD_LOG_API_CALLS,
	) # type: ignore KalturaESearchReader

	# Fetch documents via the reader's keyword-style search: free text "education",
	# captions requested, max_entries=5, search_operator_and=True.
	# NOTE(review): category_ids is passed as None, so CATEGORY_IDS_TO_FILTER
	# (defined above) is not applied; MAX_ENTRIES is likewise unused — confirm
	# whether either constant was meant to be passed here.
	entry_docs = reader.load_data(
	    search_operator_and=True,
	    free_text="education",
	    category_ids=None,
	    with_captions=True,
	    max_entries=5,
	)

	# Build a vector index over the loaded documents and a query engine with
	# llama_index's default settings.
	index = GPTVectorStoreIndex.from_documents(entry_docs)
	query_engine = index.as_query_engine()

	request = "the top 5 video segments where the speaker discusses the future of events in education"
	# The field descriptions below use exactly the keys shown in the JSON example
	# ("keywords", not "keyword") so the prompt does not contradict itself.
	response = query_engine.query(
	    "Provide a json formatted response of the following: " + request
	    + ". Your json response should look like so: {startTime: xxx, endTime:zzz, speakerName: zzz, keywords: nnn} ."
	    + "startTime represents the time in the video this segment begins. "
	    + "endTime represents the time in the video this segment ends and another topic begins (segment can be multiple lines long). "
	    + "speakerName represents the primary speaker talking in that segment. "
	    + "keywords represents a one-word description of the segment as a title of that segment. "
	)
	print(response)