Skip to content

Instantly share code, notes, and snippets.

@riccardodivirgilio
Created February 20, 2024 20:37
Show Gist options
  • Save riccardodivirgilio/11e2733900e4d33008022f5520b4b87f to your computer and use it in GitHub Desktop.
Save riccardodivirgilio/11e2733900e4d33008022f5520b4b87f to your computer and use it in GitHub Desktop.
# Colab: https://colab.research.google.com/drive/1YpDetI8BRbObPDEVdfqUcwhEX9UUXP-m?usp=sharing
import os
from pathlib import Path
from haystack import Pipeline
from haystack.components.converters import HTMLToDocument
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from itertools import islice
def all_xml_files(directory="~/Downloads/xmlsall/", limit=100):
    """Return the paths of at most ``limit`` entries found in ``directory``.

    Args:
        directory: Folder to scan. A leading ``~`` is expanded to the
            current user's home directory.
        limit: Maximum number of paths to return. ``None`` returns every
            entry in the folder.

    Returns:
        A tuple of path strings, in whatever order ``os.scandir`` yields
        them. Entries are not filtered by type or file extension.

    Raises:
        OSError: If ``directory`` cannot be scanned (for example, it does
            not exist).
    """
    # islice stops before the scandir iterator is exhausted, so without the
    # context manager the directory handle would stay open until it is
    # garbage collected.
    with os.scandir(os.path.expanduser(directory)) as entries:
        return tuple(entry.path for entry in islice(entries, limit))
# A single in-memory Chroma store is shared by the indexing pipeline below
# and the querying pipeline that follows it.
document_store = ChromaDocumentStore()

# Indexing pipeline: convert every source file into a Document, then write
# the resulting Documents into the shared store.
indexing = Pipeline()
indexing.add_component("converter", HTMLToDocument())
indexing.add_component("writer", DocumentWriter(document_store))
indexing.connect("converter", "writer")

# Inputs are keyed by component name; only the converter needs any.
indexing_inputs = {"converter": {"sources": all_xml_files()}}
indexing.run(indexing_inputs)
# Querying pipeline: one text retriever reading from the store populated by
# the indexing step.
querying = Pipeline()
querying.add_component("retriever", ChromaQueryTextRetriever(document_store))

# Ask the retriever for the three best matches for the query text.
retriever_inputs = {"query": "Variable declarations", "top_k": 3}
results = querying.run({"retriever": retriever_inputs})

# Print the metadata and score of each retrieved document.
for d in results["retriever"]["documents"]:
    print(d.meta, d.score)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment