Created
February 9, 2025 07:58
-
-
Save AIAnytime/cdfa9909e3825adb5fb7bfdd8200e5af to your computer and use it in GitHub Desktop.
Gist for adding date as metadata to chunks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from typing import Optional

from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
# --- Step 1: Extract text from the document ---
def extract_text(file_path: str) -> str:
    """
    Return the entire contents of the file at ``file_path`` as a string.

    The file is opened in text mode and decoded as UTF-8.
    This is a placeholder loader: replace the body with actual LlamaParse code.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return handle.read()
# --- Step 2: Extract date from the document text using multiple regex patterns ---
def extract_date_from_text(text: str) -> Optional[str]:
    """
    Look for a date in the text using multiple regex patterns.

    Supported formats include:
    - dd/mm/yyyy, mm/dd/yyyy, yyyy/mm/dd
    - dd-mm-yyyy, mm-dd-yyyy, yyyy-dd-mm, yyyy-mm-dd
    - dd.mm.yyyy, mm.dd.yyyy, yyyy.dd.mm, yyyy.mm.dd
    - dd-mm-yy, mm-dd-yy, yy-mm-dd

    The patterns are tried in the order listed below, and the leftmost match
    of the first pattern that matches anywhere in the text is returned. A date
    in an earlier-listed format therefore wins over a date in a later-listed
    format even if the latter appears sooner in the text.

    Matching is purely digit-shape based: the day/month/year values are not
    validated and the string is returned exactly as it appears in the text.

    Args:
        text: The document text to scan. Empty or ``None`` input yields ``None``.

    Returns:
        The matched date string, or ``None`` if no date is found.
    """
    # Nothing to scan; also avoids a TypeError from re.search on None.
    if not text:
        return None

    date_patterns = (
        r"\b\d{2}/\d{2}/\d{4}\b",    # dd/mm/yyyy or mm/dd/yyyy
        r"\b\d{4}/\d{2}/\d{2}\b",    # yyyy/mm/dd
        r"\b\d{2}-\d{2}-\d{4}\b",    # dd-mm-yyyy or mm-dd-yyyy
        r"\b\d{4}-\d{2}-\d{2}\b",    # yyyy-mm-dd or yyyy-dd-mm
        r"\b\d{2}\.\d{2}\.\d{4}\b",  # dd.mm.yyyy or mm.dd.yyyy
        r"\b\d{4}\.\d{2}\.\d{2}\b",  # yyyy.mm.dd or yyyy.dd.mm
        r"\b\d{2}-\d{2}-\d{2}\b",    # dd-mm-yy, mm-dd-yy, or yy-mm-dd
    )
    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None
# --- Step 3: Process the document: split into chunks, assign date metadata, and store vectors ---
def process_document(
    file_path: str,
    uploaded_file_date: str,
    collection_name: str = "your_collection_name",
    host: str = "localhost",
    port: int = 6333,
) -> Qdrant:
    """
    Chunk a document, tag every chunk with a date, and index it in Qdrant.

    Steps:
    1. Read the document text with ``extract_text``.
    2. Pick the date: the one found in the text by ``extract_date_from_text``,
       otherwise ``uploaded_file_date``.
    3. Split the text into chunks of 1000 characters with 250 overlap.
    4. Store the chosen date under the ``"date"`` key of each chunk's metadata.
    5. Embed the chunks with ``OpenAIEmbeddings`` and store them in Qdrant.

    Args:
        file_path: Path of the document to process.
        uploaded_file_date: Fallback date used when the text contains no date.
        collection_name: Qdrant collection to write to.
        host: Hostname of the Qdrant server.
        port: Port of the Qdrant server.

    Returns:
        The ``Qdrant`` vector store returned by ``Qdrant.from_documents``.
    """
    # 1. Extract text from the document
    text = extract_text(file_path)

    # 2. Prefer a date found in the text; fall back to the upload date
    doc_date = extract_date_from_text(text) or uploaded_file_date
    print(f"Using date: {doc_date}")

    # 3. Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
    documents = text_splitter.create_documents([text])

    # 4. Add the date metadata to each chunk
    for doc in documents:
        if not doc.metadata:
            doc.metadata = {}
        doc.metadata["date"] = doc_date

    # 5. Create vectors and store in Qdrant.
    # The connection settings are passed straight to from_documents, which
    # accepts host/port and builds its own client from them.
    # NOTE(review): a pre-built client passed as `client=` does not appear to be
    # a documented parameter of from_documents -- confirm against the installed
    # LangChain version.
    embeddings = OpenAIEmbeddings()
    return Qdrant.from_documents(
        documents,
        embeddings,
        host=host,
        port=port,
        collection_name=collection_name,
    )
# --- Example usage ---
if __name__ == "__main__":
    file_path = "path/to/your/document.txt"
    # The fallback uploaded file's date (this can be obtained from file metadata or any other means)
    uploaded_file_date = "2025-02-09"

    # Process the document and store vectors in Qdrant
    vector_store = process_document(file_path, uploaded_file_date)
    print("Vector store created. Each chunk includes the following metadata (including the date):")

    # The LangChain Qdrant wrapper does not expose a `docs` attribute, so the
    # stored chunks are read back from the collection page by page via the
    # underlying client's scroll API.
    # NOTE(review): relies on the wrapper's `client`, `collection_name` and
    # `metadata_payload_key` attributes -- confirm against the installed version.
    offset = None
    while True:
        records, offset = vector_store.client.scroll(
            collection_name=vector_store.collection_name,
            limit=100,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        for record in records:
            payload = record.payload or {}
            print(payload.get(vector_store.metadata_payload_key))
        # scroll returns None as the next offset once the last page is reached
        if offset is None:
            break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment