Skip to content

Instantly share code, notes, and snippets.

@tazarov
Last active January 18, 2024 12:27
Show Gist options
  • Save tazarov/fe7651a4957fb3b51e5c7fded4c8eaae to your computer and use it in GitHub Desktop.
Save tazarov/fe7651a4957fb3b51e5c7fded4c8eaae to your computer and use it in GitHub Desktop.
Clean ChromaDB WAL (⚠️ This script has been superseded by https://github.com/amikos-tech/chromadb-ops ⚠️)
#!/usr/bin/env python3
# Call the script: python wal_clean.py ./chroma-test-compact
#!/usr/bin/env python3
# Call the script: python wal_clean.py ./chroma-test-compact
import argparse
import importlib
import os
import sqlite3
import typer
from chromadb.segment.impl.vector.local_persistent_hnsw import PersistentData
def get_hnsw_index_ids(
    filename: str,
    space: str = "l2",
    dim: int = 384,
    max_elements: int = 100000,
) -> list[int]:
    """Return the ids stored in a persisted HNSW index.

    Args:
        filename: Path handed to ``hnswlib.Index.load_index``. ``clean_wal``
            passes the segment directory here.
        space: Distance space used to construct the index (``"l2"`` by
            default).
        dim: Dimensionality used to construct the index.
        max_elements: Value forwarded to ``load_index`` as ``max_elements``.
            Defaults to the previously hard-coded 100000.

    Returns:
        A copy of the list returned by ``index.get_ids_list()``.

    Raises:
        ImportError: If ``hnswlib`` cannot be imported; the original import
            failure is attached as ``__cause__``.
    """
    # Imported lazily so that a missing optional dependency produces an
    # actionable message instead of failing at module import time.
    try:
        hnswlib = importlib.import_module("hnswlib")
    except ImportError as err:
        raise ImportError(
            "hnswlib is not installed. Install with `pip install chroma-hnswlib`."
        ) from err
    index = hnswlib.Index(space=space, dim=dim)
    index.load_index(
        filename,
        is_persistent_index=True,
        max_elements=max_elements,
    )
    try:
        # Copy so the returned list does not depend on the index object.
        return index.get_ids_list().copy()
    finally:
        # Release the file handles even when reading the ids fails.
        index.close_file_handles()
def clean_wal(chroma_persist_dir: str) -> None:
    """Remove rows from ``embeddings_queue`` for every VECTOR segment.

    For each segment with scope ``'VECTOR'`` in ``chroma.sqlite3``:

    * if ``<persist_dir>/<segment>/index_metadata.pickle`` exists, rows of the
      segment's topic whose ``seq_id`` is below the pickled ``max_seq_id`` are
      deleted;
    * otherwise the ids are read from the HNSW index in the segment directory
      and rows whose ``seq_id`` is one of those ids are deleted, in batches.

    All deletes run in one transaction that is committed before ``VACUUM``,
    so a failure part-way through leaves the queue untouched.

    Args:
        chroma_persist_dir: Directory containing ``chroma.sqlite3`` and the
            per-segment sub-directories.

    Raises:
        Exception: If the directory or the ``chroma.sqlite3`` file is missing.
    """
    if not os.path.exists(chroma_persist_dir):
        raise Exception(f"Persist {chroma_persist_dir} dir does not exist")
    if not os.path.exists(f"{chroma_persist_dir}/chroma.sqlite3"):
        raise Exception(
            f"SQL file not found int persist dir {chroma_persist_dir}/chroma.sqlite3"
        )
    conn = sqlite3.connect(f"{chroma_persist_dir}/chroma.sqlite3")
    try:
        cursor = conn.cursor()
        query = "SELECT s.id as 'segment',s.topic as 'topic', c.id as 'collection' , c.dimension as 'dimension' FROM segments s LEFT JOIN collections c ON s.collection = c.id WHERE s.scope = 'VECTOR';"
        segments = cursor.execute(query).fetchall()

        # (sql, parameters) pairs; values are bound rather than formatted
        # into the SQL text.
        cleanup_statements = []
        for segment_id, topic, collection_id, dimension in segments:
            metadata_path = f"{chroma_persist_dir}/{segment_id}/index_metadata.pickle"
            if os.path.exists(metadata_path):
                # NOTE(review): load_from_file presumably unpickles this file;
                # only run against persist directories you trust.
                metadata = PersistentData.load_from_file(metadata_path)
                cleanup_statements.append(
                    (
                        "DELETE FROM embeddings_queue WHERE seq_id < ? AND topic = ?",
                        (int(metadata.max_seq_id), topic),
                    )
                )
                continue

            # No metadata pickle: fall back to the ids stored in the index.
            space_row = cursor.execute(
                "select str_value from collection_metadata where collection_id=? and key='hnsw:space'",
                (collection_id,),
            ).fetchone()
            hnsw_space = "l2" if space_row is None else space_row[0]
            list_of_ids = get_hnsw_index_ids(
                f"{chroma_persist_dir}/{segment_id}/", hnsw_space, dimension
            )
            # Small batches keep each statement well below SQLite's limit on
            # bound parameters.
            batch_size = 100
            for start in range(0, len(list_of_ids), batch_size):
                chunk = [int(i) for i in list_of_ids[start:start + batch_size]]
                placeholders = ",".join("?" * len(chunk))
                cleanup_statements.append(
                    (
                        f"DELETE FROM embeddings_queue WHERE seq_id IN ({placeholders})",
                        chunk,
                    )
                )

        if cleanup_statements:
            print("Cleaning up WAL")
            for sql, params in cleanup_statements:
                cursor.execute(sql, params)
            # VACUUM cannot run inside a transaction, so commit the deletes
            # first.
            conn.commit()
            cursor.execute("VACUUM;")
        cursor.close()
    finally:
        # Always release the database, including on errors; uncommitted
        # deletes are discarded when the connection closes.
        conn.close()
if __name__ == "__main__":
    # Script entry point: takes the Chroma persist directory as the single
    # positional argument, echoes it, then cleans its WAL.
    cli = argparse.ArgumentParser()
    cli.add_argument('persist_dir', type=str)
    persist_dir = cli.parse_args().persist_dir
    print(persist_dir)
    clean_wal(persist_dir)
@tazarov
Copy link
Author

tazarov commented Jan 16, 2024

WARNING: Before executing the script, ensure that Chroma is stopped and not accessing the SQLite file.

NOTE: Create a backup of /path/to/persist_dir/chroma.sqlite3 before running the script.

You can run this with:

python wal_clean.py /path/to/persist_dir

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment