# janduplessis883/01_Embedding_Data_From_A_Pandas_DataFrame_Chroma_LangChain_Ollama.py

## 01_Embedding_Data_From_A_Pandas_DataFrame_Chroma_LangChain_Ollama.py
import pandas as pd
from langchain.schema import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from tqdm import tqdm

## 02_embed_model.py
# Ollama-backed embedding model; its own progress output is switched off.
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=False)

# Create the Chroma vector store once, at import time.
# The embedding function is passed explicitly so that texts added to or
# queried from this store are embedded with the model defined above rather
# than with whatever the store falls back to when none is supplied.
vector_db = Chroma(
    collection_name="GP_Surgery_Reviews",
    embedding_function=embedding_model,
)

## 03_embed_function.py
def embed_with_chroma(df, embedding_model):
    """Embed the ``review`` text of every row in *df*.

    Args:
        df: DataFrame providing the ``index``, ``review``, ``pcn`` and
            ``surgery`` columns read below.
        embedding_model: Object exposing ``embed_documents(list_of_texts)``
            that returns one embedding per input text.

    Returns:
        A list of ``(Document, embedding)`` tuples, one for each row whose
        embedding call succeeded. Rows whose embedding call raises are
        reported on stdout and skipped. Nothing is written to a vector
        store here; the caller decides what to do with the pairs.
    """
    embeddings = []

    # iterrows() is a generator, so tqdm needs the row count spelled out.
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        document = Document(
            page_content=row['review'],  # Text content for embedding
            # The Document field is spelled ``metadata``. The previous
            # ``meta_data`` keyword is not a Document field, so the
            # pcn/surgery values never ended up on the document.
            metadata={'pcn': row['pcn'], 'surgery': row['surgery']},
            id=str(row['index']),  # Unique identifier as string
        )

        try:
            # embed_documents() works on a list of texts; wrap the single
            # text in a list and unwrap the single vector that comes back.
            embedding = embedding_model.embed_documents([document.page_content])[0]
        except Exception as e:
            # Deliberately best-effort: one failing row must not abort the run.
            print(f"Failed to embed document: {e}")
        else:
            embeddings.append((document, embedding))

    return embeddings

## 05_prepare_df.py
# Example DataFrame: one row per review, with the columns that
# embed_with_chroma() reads ('index', 'review', 'pcn', 'surgery').
data = {
    'index': [1, 2, 3],
    'review': ['Great service!', 'Needs improvement.', 'Very satisfied.'],
    'pcn': ['PCN123', 'PCN456', 'PCN789'],
    'surgery': ['SurgeryA', 'SurgeryB', 'SurgeryC'],
}
df = pd.DataFrame(data)

## 06_call_function.py
# Compute the embeddings for the example reviews built above.
# The call used to pass `nnew_data`, a name that is never defined in this
# script (NameError at runtime); `df` is the DataFrame prepared for it.
# Note that embed_with_chroma() only returns (document, embedding) pairs;
# it does not write anything to the Chroma store.
document_embeddings = embed_with_chroma(df, embedding_model)

## 07_simularity_search.py
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

# Embedding model used to embed search queries (progress output enabled).
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

# Open the persisted Chroma store.
# The collection is named explicitly: without `collection_name` the wrapper
# is bound to its default collection, not to the "GP_Surgery_Reviews"
# collection that the rest of this script creates and searches.
# NOTE(review): DATA_PATH is not defined anywhere in the visible script; it
# has to point at the persist directory before this line runs.
chroma = Chroma(
    collection_name="GP_Surgery_Reviews",
    embedding_function=embedding_model,
    persist_directory=DATA_PATH,
)

def ensure_collection_exists(chroma, collection_name):
    """Look up *collection_name* on the store's client.

    Args:
        chroma: LangChain ``Chroma`` vector store wrapper.
        collection_name: Name of the collection to look for.

    Returns:
        The collection object when the lookup succeeds, otherwise ``None``
        (after printing a notice).
    """
    try:
        # The wrapper's get() takes document ids as its first argument, so
        # passing a collection name there never tested whether the
        # collection exists. Ask the underlying client instead.
        # NOTE(review): `_client` is a private attribute of the wrapper;
        # confirm it is available in the pinned LangChain version.
        return chroma._client.get_collection(collection_name)
    except Exception:
        # NOTE(review): the exception type used for a missing collection
        # appears to differ between chromadb releases (ValueError in some),
        # hence the broad catch; narrow it once the version is pinned.
        print(f"Collection {collection_name} does not exist.")
        return None

def search_similar_documents(chroma, collection_name, query_text, k=5):
    """Return up to *k* documents most similar to *query_text*.

    The search runs against the collection the ``chroma`` wrapper was
    constructed with. ``collection_name`` stays in the signature for
    backward compatibility but is no longer forwarded: it was passed as an
    extra keyword to ``similarity_search``, where it is not a search option.
    NOTE(review): that keyword appears to have made every search fail and
    return ``[]`` - confirm against the pinned LangChain/chromadb versions.

    Args:
        chroma: LangChain ``Chroma`` vector store wrapper.
        collection_name: Unused; kept so existing callers keep working.
        query_text: Text to search for.
        k: Maximum number of documents to return.

    Returns:
        A list of documents. Each one carries the raw score reported by the
        store under ``metadata['score']`` (presumably a distance, i.e. lower
        means closer - verify for your configuration). On any search error
        the error is printed and an empty list is returned.
    """
    try:
        scored = chroma.similarity_search_with_score(query_text, k=k)
    except Exception as e:
        print(f"An error occurred during the search: {e}")
        return []

    results = []
    for doc, score in scored:
        # Callers read the score from the metadata, so record it there.
        doc.metadata['score'] = score
        results.append(doc)
    return results

# Demo: look the collection up, run one query, print each hit with its score.
collection_name = "GP_Surgery_Reviews"
collection = ensure_collection_exists(chroma, collection_name)

if collection:
    query_text = "Appointment Availability"
    similar_documents = search_similar_documents(chroma, collection_name, query_text)

    for doc in similar_documents:
        # The score is read from the document metadata; a document without
        # one is reported instead of aborting the loop.
        try:
            score = doc.metadata['score']
        except KeyError:
            print("Error processing document data; required keys not found.")
        else:
            print(f"Document: {doc.page_content}, Similarity Score: {score}")
	import pandas as pd
	from langchain.schema import Document
	from langchain_community.embeddings import OllamaEmbeddings
	from langchain_community.vectorstores import Chroma
	from tqdm import tqdm
	# Ollama-backed embedding model; its own progress output is switched off.
	embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=False)

	# Create the Chroma vector store once.
	# The embedding function is passed explicitly so that texts added to or
	# queried from this store are embedded with the model defined above rather
	# than with whatever the store falls back to when none is supplied.
	vector_db = Chroma(
	    collection_name="GP_Surgery_Reviews",
	    embedding_function=embedding_model,
	)
	def embed_with_chroma(df, embedding_model):
	    """Embed the ``review`` text of every row in *df*.

	    Args:
	        df: DataFrame providing the ``index``, ``review``, ``pcn`` and
	            ``surgery`` columns read below.
	        embedding_model: Object exposing ``embed_documents(list_of_texts)``
	            that returns one embedding per input text.

	    Returns:
	        A list of ``(Document, embedding)`` tuples, one for each row whose
	        embedding call succeeded. Rows whose embedding call raises are
	        reported on stdout and skipped. Nothing is written to a vector
	        store here.
	    """
	    embeddings = []

	    # iterrows() is a generator, so tqdm needs the row count spelled out.
	    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
	        document = Document(
	            page_content=row['review'],  # Text content for embedding
	            # The Document field is spelled ``metadata``. The previous
	            # ``meta_data`` keyword is not a Document field, so the
	            # pcn/surgery values never ended up on the document.
	            metadata={'pcn': row['pcn'], 'surgery': row['surgery']},
	            id=str(row['index']),  # Unique identifier as string
	        )

	        try:
	            # embed_documents() works on a list of texts; wrap the single
	            # text in a list and unwrap the single vector that comes back.
	            embedding = embedding_model.embed_documents([document.page_content])[0]
	        except Exception as e:
	            # Deliberately best-effort: one failing row must not abort the run.
	            print(f"Failed to embed document: {e}")
	        else:
	            embeddings.append((document, embedding))

	    return embeddings
	# Example DataFrame: one row per review, with the columns that
	# embed_with_chroma() reads ('index', 'review', 'pcn', 'surgery').
	data = {
	    'index': [1, 2, 3],
	    'review': ['Great service!', 'Needs improvement.', 'Very satisfied.'],
	    'pcn': ['PCN123', 'PCN456', 'PCN789'],
	    'surgery': ['SurgeryA', 'SurgeryB', 'SurgeryC'],
	}
	df = pd.DataFrame(data)
	# Compute the embeddings for the example reviews built above.
	# The call used to pass `nnew_data`, a name that is never defined in this
	# script (NameError at runtime); `df` is the DataFrame prepared for it.
	# embed_with_chroma() only returns (document, embedding) pairs; it does
	# not write anything to the Chroma store.
	document_embeddings = embed_with_chroma(df, embedding_model)
	from langchain_community.embeddings import OllamaEmbeddings
	from langchain_community.vectorstores import Chroma

	# Embedding model used to embed search queries (progress output enabled).
	embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

	# Open the persisted Chroma store.
	# The collection is named explicitly: without `collection_name` the wrapper
	# is bound to its default collection, not to the "GP_Surgery_Reviews"
	# collection that the rest of this script creates and searches.
	# NOTE(review): DATA_PATH is not defined anywhere in the visible script; it
	# has to point at the persist directory before this line runs.
	chroma = Chroma(
	    collection_name="GP_Surgery_Reviews",
	    embedding_function=embedding_model,
	    persist_directory=DATA_PATH,
	)

	def ensure_collection_exists(chroma, collection_name):
	    """Look up *collection_name* on the store's client.

	    Args:
	        chroma: LangChain ``Chroma`` vector store wrapper.
	        collection_name: Name of the collection to look for.

	    Returns:
	        The collection object when the lookup succeeds, otherwise ``None``
	        (after printing a notice).
	    """
	    try:
	        # The wrapper's get() takes document ids as its first argument, so
	        # passing a collection name there never tested whether the
	        # collection exists. Ask the underlying client instead.
	        # NOTE(review): `_client` is a private attribute of the wrapper;
	        # confirm it is available in the pinned LangChain version.
	        return chroma._client.get_collection(collection_name)
	    except Exception:
	        # NOTE(review): the exception type used for a missing collection
	        # appears to differ between chromadb releases (ValueError in some),
	        # hence the broad catch; narrow it once the version is pinned.
	        print(f"Collection {collection_name} does not exist.")
	        return None

	def search_similar_documents(chroma, collection_name, query_text, k=5):
	    """Return up to *k* documents most similar to *query_text*.

	    The search runs against the collection the ``chroma`` wrapper was
	    constructed with. ``collection_name`` stays in the signature for
	    backward compatibility but is no longer forwarded: it was passed as an
	    extra keyword to ``similarity_search``, where it is not a search option.
	    NOTE(review): that keyword appears to have made every search fail and
	    return ``[]`` - confirm against the pinned LangChain/chromadb versions.

	    Args:
	        chroma: LangChain ``Chroma`` vector store wrapper.
	        collection_name: Unused; kept so existing callers keep working.
	        query_text: Text to search for.
	        k: Maximum number of documents to return.

	    Returns:
	        A list of documents. Each one carries the raw score reported by the
	        store under ``metadata['score']`` (presumably a distance, i.e. lower
	        means closer - verify for your configuration). On any search error
	        the error is printed and an empty list is returned.
	    """
	    try:
	        scored = chroma.similarity_search_with_score(query_text, k=k)
	    except Exception as e:
	        print(f"An error occurred during the search: {e}")
	        return []

	    results = []
	    for doc, score in scored:
	        # Callers read the score from the metadata, so record it there.
	        doc.metadata['score'] = score
	        results.append(doc)
	    return results

	# Example usage: look the collection up, run one query, print each hit.
	# The nesting below was flattened in the original text, which left the
	# `if`, `for` and `try` statements without bodies; it is restored here.
	collection_name = "GP_Surgery_Reviews"
	collection = ensure_collection_exists(chroma, collection_name)

	if collection:
	    query_text = "Appointment Availability"
	    similar_documents = search_similar_documents(chroma, collection_name, query_text)

	    # Display the results
	    for doc in similar_documents:
	        try:
	            print(f"Document: {doc.page_content}, Similarity Score: {doc.metadata['score']}")
	        except KeyError:
	            # A document without a 'score' entry is reported, not fatal.
	            print("Error processing document data; required keys not found.")