Skip to content

Instantly share code, notes, and snippets.

@iCUE-Solutions
Created October 18, 2023 09:31
Show Gist options
  • Save iCUE-Solutions/3b9235508ab852284cfb51b040483a08 to your computer and use it in GitHub Desktop.
Save iCUE-Solutions/3b9235508ab852284cfb51b040483a08 to your computer and use it in GitHub Desktop.
chromadb
'''
Filename: /home/dirk/dev/iCUE/iCUE-SmartScribe/langchain.py
Path: /home/dirk/dev/iCUE/iCUE-SmartScribe
Created Date: Monday, April 24th 2023, 12:37:32 pm
Author: Dirk Liebich
Copyright (c) 2023 iCUE Solutions GmbH
Example
https://github.com/hwchase17/langchain/issues/2491
'''
import json
import logging
import os
import re
import sys
# import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from fastapi.encoders import jsonable_encoder
from dotenv import load_dotenv
load_dotenv()
logging.basicConfig(level=logging.DEBUG)

# Directory where the persistent Chroma index lives.
DB_DIR = 'db'

# SECURITY: never hard-code API keys in source control. The key must be
# supplied via the environment, e.g. an OPENAI_API_KEY entry in the .env
# file picked up by load_dotenv() above. (The key previously committed
# here must be considered leaked and revoked.)
if not os.environ.get("OPENAI_API_KEY"):
    logging.warning("OPENAI_API_KEY is not set; OpenAI calls will fail.")
def replace_newlines_and_spaces(text):
    """Collapse every run of whitespace (including newlines) into one space.

    Args:
        text: Arbitrary input string.

    Returns:
        The string with each maximal whitespace run replaced by a single
        space. Leading/trailing whitespace becomes a single space, matching
        the original behavior (no strip).
    """
    # `\s` matches newlines too, so one substitution does both the newline
    # replacement and the multi-space collapsing the original did in two steps.
    return re.sub(r'\s+', ' ', text)
def get_documents():
    """Load every ``*.txt`` file under the ``transcript`` directory tree.

    Returns:
        A list of langchain Document objects, one per matched file.
    """
    txt_loader = DirectoryLoader("transcript", glob="**/*.txt")
    return txt_loader.load()
def init_chromadb():
    """Build (or rebuild) the persistent Chroma index from the transcripts.

    Any existing index under DB_DIR is wiped first, so the index always
    reflects the current contents of the transcript directory. The embedded
    chunks are persisted to disk under DB_DIR.
    """
    import shutil  # local import kept, as in the original, for this one-off use

    # Delete an existing index directory and recreate it empty.
    if os.path.exists(DB_DIR):
        shutil.rmtree(DB_DIR, ignore_errors=True)
        os.mkdir(DB_DIR)

    # Normalize whitespace in each document in place before chunking.
    documents = get_documents()
    for doc in documents:
        doc.page_content = replace_newlines_and_spaces(doc.page_content)

    # Split the documents into ~3000-character chunks with no overlap.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=0,
        length_function=len,
    )
    texts = text_splitter.split_documents(documents)

    # Embed the chunks and persist the resulting vector store to disk.
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma.from_documents(texts, embeddings, persist_directory=DB_DIR)
    vectorstore.persist()
    # Drop the reference so Chroma releases its handle on the persisted DB.
    vectorstore = None
def query_chromadb(query="What is most discussed topic ", k=30):
    """Run a similarity search against the persisted Chroma index.

    Args:
        query: Natural-language search string. The default preserves the
            previously hard-coded question for backward compatibility.
        k: Number of nearest chunks to retrieve.

    Returns:
        The JSON-encodable list of (document, score) results, which is also
        pretty-printed to stdout.

    Raises:
        Exception: If DB_DIR does not exist (nothing has been indexed yet).
    """
    if not os.path.exists(DB_DIR):
        raise Exception(f"{DB_DIR} does not exist, nothing can be queried")

    # Must use the same embedding model that built the index.
    embeddings = OpenAIEmbeddings()

    # Load the vector store from local disk.
    vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
    result = vectorstore.similarity_search_with_score(query=query, k=k)

    jsonable_result = jsonable_encoder(result)
    print(json.dumps(jsonable_result, indent=2))
    return jsonable_result
def main():
    """Rebuild the transcript index, then run the demo query against it."""
    init_chromadb()
    query_chromadb()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment