-
-
Save bendangelo/53f36aff187e64fb0357f6f1c10223f7 to your computer and use it in GitHub Desktop.
# Add to model:
#   include Chromable
#   chroma do
#     hnsw_space :cosine
#     embedding :name
#     document :label
#     metadata :name
#   end
# Usage:
#   Hint.chroma_upsert Hint.all
#   @hints = Hint.chroma_query "dogs", where: {name: {"$ne": "dogs"}}
# Implement your own embedding service: embeddings = EmbeddingsService.call texts
# I used https://github.com/michaelfeil/infinity
# Concern that mirrors a model's records into a Chroma collection and queries
# them by embedding. Relies on an application-provided EmbeddingsService whose
# `call` returns one embedding per input text.
module Chromable
  extend ActiveSupport::Concern

  included do
    # Holds the ChromaConfiguration built by the `chroma` DSL call.
    class_attribute :chroma_configuration
  end

  class_methods do
    # Declares the Chroma mapping for the model. The block is evaluated against
    # a fresh ChromaConfiguration, so `embedding`, `document`, `metadata` and
    # `hnsw_space` can be called bare inside it.
    def chroma(&block)
      self.chroma_configuration = ChromaConfiguration.new
      chroma_configuration.instance_eval(&block) if block_given?
    end

    # Embeds +query+ through EmbeddingsService and runs a nearest-neighbour
    # query against the model's collection. The keyword arguments are passed
    # to the collection's `query` unchanged.
    def chroma_query(query, results: 10, where: {}, where_document: {}, include: %w[documents])
      embeddings = EmbeddingsService.call(query)

      chroma_collection.query(
        query_embeddings: [embeddings[0]],
        results: results,
        where: where,
        where_document: where_document,
        include: include
      )
    end

    # Returns the collection for this model, creating it on first use.
    #
    # Memoized in a class-level instance variable rather than a `@@` class
    # variable: a class variable is shared by every model that includes this
    # concern, which made all models reuse whichever collection was resolved
    # first.
    def chroma_collection
      @chroma_collection ||= Chroma::Resources::Collection.get_or_create(
        chroma_collection_name,
        { "hnsw:space": chroma_configuration.hnsw_space_param.to_s }
      )
    end

    # Drops the model's whole collection and forgets the memoized handle so the
    # next `chroma_collection` call re-creates it.
    #
    # This was originally a zero-argument `chroma_delete`, which the later
    # keyword-argument `chroma_delete` definition silently replaced.
    def chroma_delete_collection
      result = Chroma::Resources::Collection.delete(chroma_collection_name)
      @chroma_collection = nil
      result
    end

    # Collection name: the table name, suffixed with "_test" in the test
    # environment.
    def chroma_collection_name
      Rails.env.test? ? "#{table_name}_test" : table_name
    end

    # Delegates to the collection's `count`.
    def chroma_count
      chroma_collection.count
    end

    # Deletes entries from the collection. +ids+ are stringified before being
    # sent; +where+ and +where_document+ are passed through unchanged.
    def chroma_delete(ids: nil, where: {}, where_document: {})
      ids = ids.map(&:to_s) if ids.present?

      chroma_collection.delete(ids: ids, where: where, where_document: where_document)
    end

    # Fetches entries from the collection. +ids+ are stringified before being
    # sent; every other keyword is passed through unchanged.
    def chroma_get(ids: nil, where: {}, sort: nil, limit: nil, offset: nil, page: nil, page_size: nil, where_document: {}, include: %w[documents])
      ids = ids.map(&:to_s) if ids.present?

      chroma_collection.get(
        ids: ids,
        where: where,
        sort: sort,
        limit: limit,
        offset: offset,
        page: page,
        page_size: page_size,
        where_document: where_document,
        include: include
      )
    end

    # Embeds the configured embedding attribute of every item and upserts the
    # resulting entries into the collection.
    def chroma_upsert(items)
      # Materialize once: texts and records are then derived from the same
      # array, so embeddings[i] always belongs to records[i].
      records = items.to_a
      attribute = chroma_configuration.embedding_name.to_sym
      texts = records.map { |record| record.public_send(attribute) }
      embeddings = EmbeddingsService.call(texts)

      entries = records.each_with_index.map do |record, index|
        record.to_embedding(embeddings[index])
      end

      chroma_collection.upsert(entries)
    end
  end

  # Builds the Chroma::Resources::Embedding for this record: the stringified
  # id, the given embedding, and whatever document/metadata the model's
  # configuration declares (nothing extra when the model has no configuration).
  def to_embedding(embedding = nil)
    attributes = { id: id.to_s, embedding: embedding }

    configuration = self.class.chroma_configuration
    attributes.merge!(configuration.to_embedding_hash(self)) if configuration

    Chroma::Resources::Embedding.new(**attributes)
  end

  # Receiver of the `chroma do ... end` DSL; stores which attributes supply
  # the embedding text, the document and the metadata, plus the HNSW space.
  class ChromaConfiguration
    attr_accessor :embedding_name, :document_name, :metadata_names, :hnsw_space_param

    def initialize
      # Used when the DSL block never calls `hnsw_space`.
      self.hnsw_space_param = :l2
    end

    # DSL: attribute whose value is sent to the embedding service.
    def embedding(name)
      self.embedding_name = name
    end

    # DSL: attribute stored as the entry's document.
    def document(name)
      self.document_name = name
    end

    # DSL: value sent as the collection's "hnsw:space" setting.
    def hnsw_space(name)
      self.hnsw_space_param = name
    end

    # DSL: attributes copied into the entry's metadata.
    def metadata(*names)
      self.metadata_names = names
    end

    # Returns the :document and :metadata parts of an embedding hash for
    # +model_instance+. Each key is present only if the corresponding DSL call
    # was made; metadata keys are the attribute names as strings.
    def to_embedding_hash(model_instance)
      embedding_hash = {}

      embedding_hash[:document] = model_instance.send(document_name) if document_name

      if metadata_names
        embedding_hash[:metadata] = metadata_names.each_with_object({}) do |metadata_name, metadata_hash|
          metadata_hash[metadata_name.to_s] = model_instance.send(metadata_name)
        end
      end

      embedding_hash
    end
  end
end
Very nice module, I created this gem days ago, if you agree, we can merge some of this module functionalities in the gem.
https://github.com/AliOsm/chromable
Yeah no problem. You can include it. But I stopped using Chromadb just because it couldn't handle the scale of my app.
Can I ask about your app's scale? Number of documents, embedding size, and so on? Because I'm building an app that will have ~8M documents.
Very nice module, I created this gem days ago, if you agree, we can merge some of this module functionalities in the gem.
https://github.com/AliOsm/chromable
Yeah no problem. You can include it. But I stopped using Chromadb just because it couldn't handle the scale of my app.
Can I ask about your app's scale? Number of documents, embedding size, and so on? Because I'm building an app that will have ~8M documents.
Maybe I did something wrong, but I tried with 100,000 or 200,000 documents and searches were taking 30 seconds. I will be trying Milvus now.
I recommend you try now with just 100,000 and see how it performs.
I'm trying currently with multiple document sizes. 10K is working fine, <50ms. Adding more documents.
I just tried with 100K documents and the query takes ~40ms. I think there is something wrong with your setup. Are you using the chroma-db gem notebook as is?
What's the collection type set as? L2 or cosine? And how many dimensions do you have for each embedding? I had 384
I'm using the default chromadb settings, and my embedding size is 1024.
Ok I'll test it again.
I'm using the default chromadb settings, and my embedding size is 1024.
I've found the issue. Searching takes a lot of cpu resources, so if the server is constrained for cpu time searching will slow to a crawl (postgres and other services don't have this issue). There must be some optimization on Chroma's side to fix this.
Yeah no problem. You can include it in. But I stopped using Chromadb just cause it couldn't handle the scale of my app.