Created
February 20, 2024 20:37
-
-
Save riccardodivirgilio/11e2733900e4d33008022f5520b4b87f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Colab: https://colab.research.google.com/drive/1YpDetI8BRbObPDEVdfqUcwhEX9UUXP-m?usp=sharing
import os | |
from pathlib import Path | |
from haystack import Pipeline | |
from haystack.components.converters import HTMLToDocument | |
from haystack.components.writers import DocumentWriter | |
from haystack_integrations.document_stores.chroma import ChromaDocumentStore | |
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever | |
from itertools import islice | |
def all_xml_files(directory="~/Downloads/xmlsall/", limit=100):
    """Return the paths of up to ``limit`` entries found in ``directory``.

    Args:
        directory: Directory to scan. A leading ``~`` is expanded to the
            user's home directory. Defaults to the location this script
            originally hard-coded.
        limit: Maximum number of paths to return (default 100). ``None``
            returns every entry.

    Returns:
        A tuple of path strings, in whatever order ``os.scandir`` yields
        them. Entries are not filtered by type or file extension.

    Raises:
        OSError: If ``directory`` cannot be scanned (for example, it does
            not exist).
    """
    # islice may stop before the scandir iterator is exhausted; the context
    # manager closes it deterministically instead of leaving the directory
    # handle open until garbage collection (which emits a ResourceWarning).
    with os.scandir(os.path.expanduser(directory)) as entries:
        return tuple(entry.path for entry in islice(entries, limit))
# The document store below is in-memory Chroma, so the indexing and querying
# pipelines must both be handed this very same instance.
document_store = ChromaDocumentStore()

# Indexing pipeline: the converter's output feeds the writer, which fills the store.
indexing = Pipeline()
indexing.add_component("converter", HTMLToDocument())
indexing.add_component("writer", DocumentWriter(document_store))
indexing.connect("converter", "writer")
indexing_inputs = {"converter": {"sources": all_xml_files()}}
indexing.run(indexing_inputs)

# Querying pipeline: a single retriever reading from the same store.
querying = Pipeline()
querying.add_component("retriever", ChromaQueryTextRetriever(document_store))
query_inputs = {"retriever": {"query": "Variable declarations", "top_k": 3}}
results = querying.run(query_inputs)

# Print the metadata and score of each retrieved document.
retrieved = results["retriever"]["documents"]
for document in retrieved:
    print(document.meta, document.score)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment