# AIAnytime/date_metadata.py

## date_metadata.py
import re
from typing import Optional

from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient

# --- Step 1: Extract text from the document ---
def extract_text(file_path: str) -> str:
    """
    Return the entire contents of the UTF-8 text file at ``file_path``.

    This is a stand-in: replace the body with actual LlamaParse code.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        return source.read()

# --- Step 2: Extract date from the document text using multiple regex patterns ---
def extract_date_from_text(text: str) -> Optional[str]:
    """
    Return the date-like string that occurs earliest in ``text``.

    Supported formats (digit shapes only; no calendar validation is done):
      - dd/mm/yyyy, mm/dd/yyyy, yyyy/mm/dd
      - dd-mm-yyyy, mm-dd-yyyy, yyyy-dd-mm, yyyy-mm-dd
      - dd.mm.yyyy, mm.dd.yyyy, yyyy.dd.mm, yyyy.mm.dd
      - dd-mm-yy, mm-dd-yy, yy-mm-dd

    Args:
        text: Document text to scan. ``None`` or an empty string yields ``None``.

    Returns:
        The matched substring exactly as it appears in the text, or ``None``
        if none of the patterns match.
    """
    if not text:
        return None

    date_patterns = [
        r"\b\d{2}/\d{2}/\d{4}\b",    # dd/mm/yyyy or mm/dd/yyyy
        r"\b\d{4}/\d{2}/\d{2}\b",    # yyyy/mm/dd
        r"\b\d{2}-\d{2}-\d{4}\b",    # dd-mm-yyyy or mm-dd-yyyy
        r"\b\d{4}-\d{2}-\d{2}\b",    # yyyy-mm-dd or yyyy-dd-mm
        r"\b\d{2}\.\d{2}\.\d{4}\b",  # dd.mm.yyyy or mm.dd.yyyy
        r"\b\d{4}\.\d{2}\.\d{2}\b",  # yyyy.mm.dd or yyyy.dd.mm
        r"\b\d{2}-\d{2}-\d{2}\b",    # dd-mm-yy, mm-dd-yy, or yy-mm-dd
    ]

    # Returning the first *pattern* that hits would let a date deep in the
    # document win over one in the header just because its format is listed
    # earlier. Instead, collect each pattern's first hit and keep the one that
    # starts earliest; min() keeps the first of equal keys, so list order is
    # still the tie-breaker.
    hits = (re.search(pattern, text) for pattern in date_patterns)
    earliest = min(
        (hit for hit in hits if hit is not None),
        key=lambda hit: hit.start(),
        default=None,
    )
    return earliest.group(0) if earliest is not None else None

# --- Step 3: Process the document: split into chunks, assign date metadata, and store vectors ---
def process_document(
    file_path: str,
    uploaded_file_date: str,
    *,
    collection_name: str = "your_collection_name",
    host: str = "localhost",
    port: int = 6333,
) -> Qdrant:
    """
    Chunk a document, tag every chunk with a date, and index it in Qdrant.

    Args:
        file_path: Path of the document handed to ``extract_text``.
        uploaded_file_date: Fallback date used when ``extract_date_from_text``
            finds no date in the document text.
        collection_name: Qdrant collection to write the chunks to.
        host: Hostname of the Qdrant server.
        port: Port of the Qdrant server.

    Returns:
        The LangChain ``Qdrant`` vector store built from the chunks.
    """
    # 1. Extract text from the document
    text = extract_text(file_path)

    # 2. Prefer a date found in the text; otherwise use the fallback date
    doc_date = extract_date_from_text(text) or uploaded_file_date
    print(f"Using date: {doc_date}")

    # 3. Split text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
    documents = text_splitter.create_documents([text])

    # 4. Add the date metadata to each chunk
    for doc in documents:
        if not doc.metadata:
            doc.metadata = {}
        doc.metadata["date"] = doc_date

    # 5. Create vectors and store in Qdrant.
    # from_documents builds its own client from connection keyword arguments,
    # so pass host/port rather than a pre-built client object.
    embeddings = OpenAIEmbeddings()
    return Qdrant.from_documents(
        documents,
        embeddings,
        host=host,
        port=port,
        collection_name=collection_name,
    )

# --- Example usage ---
if __name__ == "__main__":
    file_path = "path/to/your/document.txt"

    # The fallback uploaded file's date (this can be obtained from file metadata or any other means)
    uploaded_file_date = "2025-02-09"

    # Process the document and store vectors in Qdrant
    vector_store = process_document(file_path, uploaded_file_date)

    print("Vector store created. Each chunk includes the following metadata (including the date):")
    # The vector store wrapper exposes no `.docs` attribute, so read the
    # stored payloads back from the collection. Only the first page of points
    # (up to `limit`) is shown.
    records, _next_offset = vector_store.client.scroll(
        collection_name=vector_store.collection_name,
        limit=100,
        with_payload=True,
    )
    for record in records:
        print((record.payload or {}).get(vector_store.metadata_payload_key))
	import re
	from langchain.docstore.document import Document
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.embeddings.openai import OpenAIEmbeddings
	from langchain.vectorstores import Qdrant
	from qdrant_client import QdrantClient

	# --- Step 1: Extract text from the document ---
	def extract_text(file_path: str) -> str:
	    """
	    Return the entire contents of the UTF-8 text file at ``file_path``.

	    This is a stand-in: replace the body with actual LlamaParse code.
	    """
	    # Block structure restored: the flattened indentation did not compile.
	    with open(file_path, "r", encoding="utf-8") as f:
	        text = f.read()
	    return text

	# --- Step 2: Extract date from the document text using multiple regex patterns ---
	def extract_date_from_text(text: str) -> Optional[str]:
	    """
	    Return the date-like string that occurs earliest in ``text``.

	    Supported formats (digit shapes only; no calendar validation is done):
	      - dd/mm/yyyy, mm/dd/yyyy, yyyy/mm/dd
	      - dd-mm-yyyy, mm-dd-yyyy, yyyy-dd-mm, yyyy-mm-dd
	      - dd.mm.yyyy, mm.dd.yyyy, yyyy.dd.mm, yyyy.mm.dd
	      - dd-mm-yy, mm-dd-yy, yy-mm-dd

	    Args:
	        text: Document text to scan. ``None`` or an empty string yields ``None``.

	    Returns:
	        The matched substring exactly as it appears in the text, or ``None``
	        if none of the patterns match.
	    """
	    if not text:
	        return None

	    date_patterns = [
	        r"\b\d{2}/\d{2}/\d{4}\b",    # dd/mm/yyyy or mm/dd/yyyy
	        r"\b\d{4}/\d{2}/\d{2}\b",    # yyyy/mm/dd
	        r"\b\d{2}-\d{2}-\d{4}\b",    # dd-mm-yyyy or mm-dd-yyyy
	        r"\b\d{4}-\d{2}-\d{2}\b",    # yyyy-mm-dd or yyyy-dd-mm
	        r"\b\d{2}\.\d{2}\.\d{4}\b",  # dd.mm.yyyy or mm.dd.yyyy
	        r"\b\d{4}\.\d{2}\.\d{2}\b",  # yyyy.mm.dd or yyyy.dd.mm
	        r"\b\d{2}-\d{2}-\d{2}\b",    # dd-mm-yy, mm-dd-yy, or yy-mm-dd
	    ]

	    # Collect each pattern's first hit and keep the one that starts
	    # earliest, so pattern order no longer decides which date wins; min()
	    # keeps the first of equal keys, so list order is only a tie-breaker.
	    hits = (re.search(pattern, text) for pattern in date_patterns)
	    earliest = min(
	        (hit for hit in hits if hit is not None),
	        key=lambda hit: hit.start(),
	        default=None,
	    )
	    return earliest.group(0) if earliest is not None else None

	# --- Step 3: Process the document: split into chunks, assign date metadata, and store vectors ---
	def process_document(
	    file_path: str,
	    uploaded_file_date: str,
	    *,
	    collection_name: str = "your_collection_name",
	    host: str = "localhost",
	    port: int = 6333,
	) -> Qdrant:
	    """
	    Chunk a document, tag every chunk with a date, and index it in Qdrant.

	    Args:
	        file_path: Path of the document handed to ``extract_text``.
	        uploaded_file_date: Fallback date used when ``extract_date_from_text``
	            finds no date in the document text.
	        collection_name: Qdrant collection to write the chunks to.
	        host: Hostname of the Qdrant server.
	        port: Port of the Qdrant server.

	    Returns:
	        The LangChain ``Qdrant`` vector store built from the chunks.
	    """
	    # 1. Extract text from the document
	    text = extract_text(file_path)

	    # 2. Prefer a date found in the text; otherwise use the fallback date
	    doc_date = extract_date_from_text(text) or uploaded_file_date
	    print(f"Using date: {doc_date}")

	    # 3. Split text into overlapping chunks
	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
	    documents = text_splitter.create_documents([text])

	    # 4. Add the date metadata to each chunk
	    for doc in documents:
	        if not doc.metadata:
	            doc.metadata = {}
	        doc.metadata["date"] = doc_date

	    # 5. Create vectors and store in Qdrant.
	    # from_documents builds its own client from connection keyword
	    # arguments, so pass host/port rather than a pre-built client object.
	    embeddings = OpenAIEmbeddings()
	    return Qdrant.from_documents(
	        documents,
	        embeddings,
	        host=host,
	        port=port,
	        collection_name=collection_name,
	    )

	# --- Example usage ---
	if __name__ == "__main__":
	    file_path = "path/to/your/document.txt"

	    # The fallback uploaded file's date (this can be obtained from file metadata or any other means)
	    uploaded_file_date = "2025-02-09"

	    # Process the document and store vectors in Qdrant
	    vector_store = process_document(file_path, uploaded_file_date)

	    print("Vector store created. Each chunk includes the following metadata (including the date):")
	    # The vector store wrapper exposes no `.docs` attribute, so read the
	    # stored payloads back from the collection. Only the first page of
	    # points (up to `limit`) is shown.
	    records, _next_offset = vector_store.client.scroll(
	        collection_name=vector_store.collection_name,
	        limit=100,
	        with_payload=True,
	    )
	    for record in records:
	        print((record.payload or {}).get(vector_store.metadata_payload_key))