Skip to content

Instantly share code, notes, and snippets.

@AIAnytime
Created February 9, 2025 07:58
Show Gist options
  • Save AIAnytime/cdfa9909e3825adb5fb7bfdd8200e5af to your computer and use it in GitHub Desktop.
Save AIAnytime/cdfa9909e3825adb5fb7bfdd8200e5af to your computer and use it in GitHub Desktop.
Gist for adding date as metadata to chunks
import re
from typing import Optional

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
# --- Step 1: Extract text from the document ---
def extract_text(file_path: str) -> str:
    """Return the full contents of the text file at *file_path*.

    This is a placeholder reader: replace it with the actual LlamaParse
    code when parsing non-plain-text documents.

    Args:
        file_path: Path of the file to read. It is opened in text mode and
            decoded as UTF-8.

    Returns:
        The entire file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()
# --- Step 2: Extract date from the document text using multiple regex patterns ---
# Supported numeric date shapes. Order only matters as a tie-break when two
# patterns could match at the very same position in the text.
_DATE_PATTERNS = (
    r"\b\d{2}/\d{2}/\d{4}\b",  # dd/mm/yyyy or mm/dd/yyyy
    r"\b\d{4}/\d{2}/\d{2}\b",  # yyyy/mm/dd
    r"\b\d{2}-\d{2}-\d{4}\b",  # dd-mm-yyyy or mm-dd-yyyy
    r"\b\d{4}-\d{2}-\d{2}\b",  # yyyy-mm-dd or yyyy-dd-mm
    r"\b\d{2}\.\d{2}\.\d{4}\b",  # dd.mm.yyyy or mm.dd.yyyy
    r"\b\d{4}\.\d{2}\.\d{2}\b",  # yyyy.mm.dd or yyyy.dd.mm
    r"\b\d{2}-\d{2}-\d{2}\b",  # dd-mm-yy, mm-dd-yy, or yy-mm-dd
)
# A single alternation lets the regex engine return the leftmost match in the
# text, instead of the first match of whichever pattern is listed first.
_DATE_REGEX = re.compile("|".join(_DATE_PATTERNS))


def extract_date_from_text(text: str) -> Optional[str]:
    """Return the first date-like string that appears in *text*.

    Supported formats include:
    - dd/mm/yyyy, mm/dd/yyyy, yyyy/mm/dd
    - dd-mm-yyyy, mm-dd-yyyy, yyyy-dd-mm, yyyy-mm-dd
    - dd.mm.yyyy, mm.dd.yyyy, yyyy.dd.mm, yyyy.mm.dd
    - dd-mm-yy, mm-dd-yy, yy-mm-dd

    The match is purely positional on digit groups: the returned substring is
    exactly as written in the text and is neither validated as a real
    calendar date nor normalized to a common format.

    Args:
        text: Document text to scan. May be empty.

    Returns:
        The date string occurring earliest in the text, or ``None`` if the
        text is empty or contains no supported date format.
    """
    if not text:
        return None
    match = _DATE_REGEX.search(text)
    return match.group(0) if match else None
# --- Step 3: Process the document: split into chunks, assign date metadata, and store vectors ---
def process_document(
    file_path: str,
    uploaded_file_date: str,
    collection_name: str = "your_collection_name",
    host: str = "localhost",
    port: int = 6333,
) -> Qdrant:
    """Chunk a document, tag every chunk with a date, and index it in Qdrant.

    Args:
        file_path: Path of the document to read via ``extract_text``.
        uploaded_file_date: Fallback date string used when no date can be
            extracted from the document text.
        collection_name: Qdrant collection that receives the vectors.
        host: Hostname of the Qdrant server.
        port: REST port of the Qdrant server.

    Returns:
        The LangChain ``Qdrant`` vector store holding the embedded chunks.
        Each chunk carries ``{"date": <doc_date>}`` in its metadata.
    """
    # 1. Extract text from the document
    text = extract_text(file_path)

    # 2. Try to extract a date from the document text; if not found, use the fallback date
    doc_date = extract_date_from_text(text) or uploaded_file_date
    print(f"Using date: {doc_date}")

    # 3. Split text into chunks and 4. attach the date metadata to each chunk.
    #    The single metadata dict given for the one input text is applied to
    #    every chunk produced from it.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
    documents = text_splitter.create_documents([text], metadatas=[{"date": doc_date}])

    # 5. Create vectors and store in Qdrant.
    #    NOTE(review): per the LangChain Qdrant integration docs,
    #    `from_documents` builds its own client from connection keyword
    #    arguments (host/port/url/...) rather than accepting a prebuilt
    #    `client=` object, so the connection details are passed directly.
    embeddings = OpenAIEmbeddings()
    vector_store = Qdrant.from_documents(
        documents,
        embeddings,
        host=host,
        port=port,
        collection_name=collection_name,
    )
    return vector_store
# --- Example usage ---
if __name__ == "__main__":
    file_path = "path/to/your/document.txt"
    # The fallback uploaded file's date (this can be obtained from file metadata or any other means)
    uploaded_file_date = "2025-02-09"

    # Process the document and store vectors in Qdrant
    vector_store = process_document(file_path, uploaded_file_date)
    print("Vector store created. Each chunk includes the following metadata (including the date):")

    # The LangChain Qdrant store does not expose a `docs` attribute, so read
    # the stored points back from the collection page by page. LangChain keeps
    # each chunk's metadata under the "metadata" key of the point payload.
    next_offset = None
    while True:
        records, next_offset = vector_store.client.scroll(
            collection_name=vector_store.collection_name,
            limit=100,
            offset=next_offset,
            with_payload=True,
            with_vectors=False,
        )
        for record in records:
            print((record.payload or {}).get("metadata"))
        # scroll() returns None as the next offset once the last page is read.
        if next_offset is None:
            break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment