Skip to content

Instantly share code, notes, and snippets.

@TheMcSebi
Created February 12, 2024 14:13
Show Gist options
  • Save TheMcSebi/424a45287aad073a2a7b31633d84e56f to your computer and use it in GitHub Desktop.
Save TheMcSebi/424a45287aad073a2a7b31633d84e56f to your computer and use it in GitHub Desktop.
RAGatouille index creation by directory scan
from ragatouille import RAGPretrainedModel
import os, glob
# Module-level model handle, created at import time from the
# "colbert-ir/colbertv2.0" checkpoint name; the script section below calls
# RAG.index() on it to build the index.
# NOTE(review): presumably this loads (and may download) the model weights,
# so importing this module is not cheap — confirm against RAGatouille docs.
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
def create_document_corpus(folderpath: str, filetype: str = "txt"):
    """Read every ``*.<filetype>`` file below *folderpath* into parallel lists.

    The directory tree is scanned recursively. Files are visited in sorted
    path order so that the generated ids are reproducible between runs.

    Args:
        folderpath: Root directory to scan. It is treated as a literal path;
            glob metacharacters in it (``[``, ``*``, ``?``) are escaped.
        filetype: File extension to collect, without the leading dot.

    Returns:
        A tuple ``(documents, document_ids, document_metadatas)`` of three
        lists with one entry per file, index-aligned:

        * ``documents`` - the file contents as ``str``;
        * ``document_ids`` - ``"<position>-<file name>"``;
        * ``document_metadatas`` - ``{"source": "file", "path": <file path>}``.

        All three lists are empty when no file matches.
    """
    # Escape the root so a folder such as "notes[1]" is matched literally
    # instead of being interpreted as a glob character class.
    pattern = os.path.join(glob.escape(folderpath), f"**/*.{filetype}")

    # glob yields entries in arbitrary filesystem order and the position is
    # part of each id, so sort to keep ids stable. Directories whose name
    # happens to end in the extension are skipped; open() would fail on them.
    filepaths = sorted(
        path for path in glob.glob(pattern, recursive=True) if os.path.isfile(path)
    )

    my_documents = []
    document_ids = []
    document_metadatas = []
    for i, filepath in enumerate(filepaths):
        # errors="ignore" is deliberate best-effort: bytes that are not valid
        # UTF-8 are dropped rather than aborting the whole scan.
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            my_documents.append(f.read())
        document_ids.append(f"{i}-{os.path.basename(filepath)}")
        document_metadatas.append({"source": "file", "path": filepath})
    return my_documents, document_ids, document_metadatas
if __name__ == "__main__":
    # Script entry point: collect every Markdown file under the notes
    # directory, then pass the three parallel lists to RAG.index() and report
    # the value it returns.
    notes_dir = "/mnt/d/Users/admin/OneDrive/Documents/Obsidian"
    texts, doc_ids, metadatas = create_document_corpus(notes_dir, filetype="md")
    built_index = RAG.index(
        index_name="obsidian_index",
        collection=texts,
        document_ids=doc_ids,
        document_metadatas=metadatas,
    )
    print("index path", built_index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment