tazarov/wal_clean.py

## wal_clean.py
#!/usr/bin/env python3
# Usage:   python wal_clean.py <chroma_persist_dir>
# Example: python wal_clean.py ./chroma-test-compact
import argparse
import importlib
import os
import sqlite3

import typer
from chromadb.segment.impl.vector.local_persistent_hnsw import PersistentData


def get_hnsw_index_ids(
    filename: str,
    space: str = "l2",
    dim: int = 384,
    max_elements: int = 100000,
) -> list[int]:
    """Return the element ids stored in a persisted HNSW index.

    Args:
        filename: Path handed to ``hnswlib.Index.load_index``.
        space: Distance space the index is opened with (default ``"l2"``).
        dim: Dimensionality the index is opened with (default 384).
        max_elements: Capacity passed to ``load_index``. Defaults to the
            previously hard-coded 100000; pass a larger value for bigger
            indexes.

    Returns:
        A copy of the list reported by ``index.get_ids_list()``.

    Raises:
        ImportError: If the ``hnswlib`` module cannot be imported.
    """
    try:
        hnswlib = importlib.import_module("hnswlib")
    except ImportError as err:
        # Chain the cause so the underlying import failure stays visible.
        raise ImportError(
            "hnswlib is not installed. Install with `pip install chroma-hnswlib`."
        ) from err
    index = hnswlib.Index(space=space, dim=dim)
    index.load_index(
        filename,
        is_persistent_index=True,
        max_elements=max_elements,
    )
    # Guard only the part after a successful load: the handles are closed
    # even when reading the ids fails, but never on an index that did not load.
    try:
        ids = index.get_ids_list().copy()
    finally:
        index.close_file_handles()
    return ids


def clean_wal(chroma_persist_dir: str) -> None:
    """Delete already-indexed rows from the ``embeddings_queue`` table.

    For every segment with scope ``VECTOR`` in ``chroma.sqlite3``:

    * if ``<segment>/index_metadata.pickle`` exists, rows of the segment's
      topic whose ``seq_id`` is below the pickled ``max_seq_id`` are deleted;
    * otherwise the ids stored in the segment's HNSW index are read and rows
      whose ``seq_id`` is among them are deleted, in batches of 100.

    All deletes run in one transaction. If at least one delete was queued,
    the database is vacuumed afterwards.

    Args:
        chroma_persist_dir: Directory containing ``chroma.sqlite3`` and the
            per-segment index directories.

    Raises:
        Exception: If the directory or its ``chroma.sqlite3`` file is missing.
    """
    if not os.path.exists(chroma_persist_dir):
        raise Exception(f"Persist {chroma_persist_dir} dir does not exist")
    if not os.path.exists(f"{chroma_persist_dir}/chroma.sqlite3"):
        raise Exception(
            f"SQL file not found int persist dir {chroma_persist_dir}/chroma.sqlite3"
        )
    conn = sqlite3.connect(f"{chroma_persist_dir}/chroma.sqlite3")
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                "SELECT s.id as 'segment',s.topic as 'topic', c.id as 'collection' , "
                "c.dimension as 'dimension' FROM segments s LEFT JOIN collections c "
                "ON s.collection = c.id WHERE s.scope = 'VECTOR';"
            )
            # Each entry is (sql, parameters). Values are bound rather than
            # interpolated, so a topic containing a quote cannot break the SQL.
            wal_cleanup_queries = []
            for segment_id, topic, collection_id, dimension in cursor.fetchall():
                segment_dir = f"{chroma_persist_dir}/{segment_id}"
                metadata_path = f"{segment_dir}/index_metadata.pickle"
                if os.path.exists(metadata_path):
                    # NOTE(review): the file is a pickle (per its name); only
                    # run this against persist directories you trust.
                    metadata = PersistentData.load_from_file(metadata_path)
                    wal_cleanup_queries.append(
                        (
                            "DELETE FROM embeddings_queue WHERE seq_id < ? AND topic = ?",
                            (metadata.max_seq_id, topic),
                        )
                    )
                    continue
                if dimension is None or not os.path.isdir(segment_dir):
                    # No on-disk index to inspect (the LEFT JOIN produced no
                    # dimension, or the segment directory is absent): nothing
                    # can be shown to be indexed, so keep this segment's rows.
                    continue
                space_row = cursor.execute(
                    "select str_value from collection_metadata where collection_id=? and key='hnsw:space'",
                    (collection_id,),
                ).fetchone()
                hnsw_space = "l2" if space_row is None else space_row[0]
                # NOTE(review): this treats the ids stored in the HNSW index as
                # WAL seq_ids — confirm that holds for the Chroma version in use.
                list_of_ids = get_hnsw_index_ids(
                    f"{segment_dir}/", hnsw_space, dimension
                )
                batch_size = 100
                for start in range(0, len(list_of_ids), batch_size):
                    batch = [int(i) for i in list_of_ids[start:start + batch_size]]
                    placeholders = ",".join("?" * len(batch))
                    wal_cleanup_queries.append(
                        (
                            f"DELETE FROM embeddings_queue WHERE seq_id IN ({placeholders})",
                            batch,
                        )
                    )
            if wal_cleanup_queries:
                print("Cleaning up WAL")
                for sql, params in wal_cleanup_queries:
                    cursor.execute(sql, params)
                # VACUUM cannot run inside a transaction: commit the deletes first.
                conn.commit()
                cursor.execute("VACUUM;")
        finally:
            cursor.close()
    finally:
        # Closing without a commit discards a partially applied cleanup.
        conn.close()


if __name__ == "__main__":
    # Command-line entry point: the single positional argument is the
    # persistence directory to clean. It is echoed before the cleanup runs.
    cli = argparse.ArgumentParser()
    cli.add_argument("persist_dir", type=str)
    persist_dir = cli.parse_args().persist_dir
    print(persist_dir)
    clean_wal(persist_dir)
	#!/usr/bin/env python3
	# Call the script: python wal_clean.py ./chroma-test-compact
	#!/usr/bin/env python3
	# Call the script: python wal_clean.py ./chroma-test-compact
	import argparse
	import importlib
	import os
	import sqlite3

	import typer
	from chromadb.segment.impl.vector.local_persistent_hnsw import PersistentData


	def get_hnsw_index_ids(
	    filename: str,
	    space: str = "l2",
	    dim: int = 384,
	    max_elements: int = 100000,
	) -> list[int]:
	    """Return the element ids stored in a persisted HNSW index.

	    Args:
	        filename: Path handed to ``hnswlib.Index.load_index``.
	        space: Distance space the index is opened with (default ``"l2"``).
	        dim: Dimensionality the index is opened with (default 384).
	        max_elements: Capacity passed to ``load_index``. Defaults to the
	            previously hard-coded 100000; pass a larger value for bigger
	            indexes.

	    Returns:
	        A copy of the list reported by ``index.get_ids_list()``.

	    Raises:
	        ImportError: If the ``hnswlib`` module cannot be imported.
	    """
	    try:
	        hnswlib = importlib.import_module("hnswlib")
	    except ImportError as err:
	        # Chain the cause so the underlying import failure stays visible.
	        raise ImportError(
	            "hnswlib is not installed. Install with `pip install chroma-hnswlib`."
	        ) from err
	    index = hnswlib.Index(space=space, dim=dim)
	    index.load_index(
	        filename,
	        is_persistent_index=True,
	        max_elements=max_elements,
	    )
	    # Guard only the part after a successful load: the handles are closed
	    # even when reading the ids fails, but never on an index that did not load.
	    try:
	        ids = index.get_ids_list().copy()
	    finally:
	        index.close_file_handles()
	    return ids


	def clean_wal(chroma_persist_dir: str) -> None:
	    """Delete already-indexed rows from the ``embeddings_queue`` table.

	    For every segment with scope ``VECTOR`` in ``chroma.sqlite3``:

	    * if ``<segment>/index_metadata.pickle`` exists, rows of the segment's
	      topic whose ``seq_id`` is below the pickled ``max_seq_id`` are deleted;
	    * otherwise the ids stored in the segment's HNSW index are read and rows
	      whose ``seq_id`` is among them are deleted, in batches of 100.

	    All deletes run in one transaction. If at least one delete was queued,
	    the database is vacuumed afterwards.

	    Args:
	        chroma_persist_dir: Directory containing ``chroma.sqlite3`` and the
	            per-segment index directories.

	    Raises:
	        Exception: If the directory or its ``chroma.sqlite3`` file is missing.
	    """
	    if not os.path.exists(chroma_persist_dir):
	        raise Exception(f"Persist {chroma_persist_dir} dir does not exist")
	    if not os.path.exists(f"{chroma_persist_dir}/chroma.sqlite3"):
	        raise Exception(
	            f"SQL file not found int persist dir {chroma_persist_dir}/chroma.sqlite3"
	        )
	    conn = sqlite3.connect(f"{chroma_persist_dir}/chroma.sqlite3")
	    try:
	        cursor = conn.cursor()
	        try:
	            cursor.execute(
	                "SELECT s.id as 'segment',s.topic as 'topic', c.id as 'collection' , "
	                "c.dimension as 'dimension' FROM segments s LEFT JOIN collections c "
	                "ON s.collection = c.id WHERE s.scope = 'VECTOR';"
	            )
	            # Each entry is (sql, parameters). Values are bound rather than
	            # interpolated, so a topic containing a quote cannot break the SQL.
	            wal_cleanup_queries = []
	            for segment_id, topic, collection_id, dimension in cursor.fetchall():
	                segment_dir = f"{chroma_persist_dir}/{segment_id}"
	                metadata_path = f"{segment_dir}/index_metadata.pickle"
	                if os.path.exists(metadata_path):
	                    # NOTE(review): the file is a pickle (per its name); only
	                    # run this against persist directories you trust.
	                    metadata = PersistentData.load_from_file(metadata_path)
	                    wal_cleanup_queries.append(
	                        (
	                            "DELETE FROM embeddings_queue WHERE seq_id < ? AND topic = ?",
	                            (metadata.max_seq_id, topic),
	                        )
	                    )
	                    continue
	                if dimension is None or not os.path.isdir(segment_dir):
	                    # No on-disk index to inspect (the LEFT JOIN produced no
	                    # dimension, or the segment directory is absent): nothing
	                    # can be shown to be indexed, so keep this segment's rows.
	                    continue
	                space_row = cursor.execute(
	                    "select str_value from collection_metadata where collection_id=? and key='hnsw:space'",
	                    (collection_id,),
	                ).fetchone()
	                hnsw_space = "l2" if space_row is None else space_row[0]
	                # NOTE(review): this treats the ids stored in the HNSW index as
	                # WAL seq_ids — confirm that holds for the Chroma version in use.
	                list_of_ids = get_hnsw_index_ids(
	                    f"{segment_dir}/", hnsw_space, dimension
	                )
	                batch_size = 100
	                for start in range(0, len(list_of_ids), batch_size):
	                    batch = [int(i) for i in list_of_ids[start:start + batch_size]]
	                    placeholders = ",".join("?" * len(batch))
	                    wal_cleanup_queries.append(
	                        (
	                            f"DELETE FROM embeddings_queue WHERE seq_id IN ({placeholders})",
	                            batch,
	                        )
	                    )
	            if wal_cleanup_queries:
	                print("Cleaning up WAL")
	                for sql, params in wal_cleanup_queries:
	                    cursor.execute(sql, params)
	                # VACUUM cannot run inside a transaction: commit the deletes first.
	                conn.commit()
	                cursor.execute("VACUUM;")
	        finally:
	            cursor.close()
	    finally:
	        # Closing without a commit discards a partially applied cleanup.
	        conn.close()


	if __name__ == "__main__":
	    # Command-line entry point: the single positional argument is the
	    # persistence directory to clean. It is echoed before the cleanup runs.
	    parser = argparse.ArgumentParser()
	    parser.add_argument('persist_dir', type=str)
	    arg = parser.parse_args()
	    print(arg.persist_dir)
	    clean_wal(arg.persist_dir)