@tazarov
Last active June 24, 2024 13:56
An example of how one can chunk texts from large documents in Chroma using LangChain.
import uuid
from chromadb.utils import embedding_functions
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
long_text = """
The Downsides of LLMs (Large Language Models)
In the age of artificial intelligence, Large Language Models (LLMs) represent a significant leap forward in the field of natural language processing. These models are capable of comprehending, generating, and reasoning about human language in a way that mimics human understanding. While the benefits of LLMs are numerous, it is essential to also recognize the downsides that accompany these advancements. This essay will explore the negative aspects of LLMs in terms of ethics, job displacement, security, potential biases, and environmental impact.
1. Ethical Concerns
The deployment of LLMs in various sectors raises serious ethical questions. For instance, if an LLM is utilized in a legal or medical context, it could inadvertently provide incorrect advice, leading to harmful consequences. Ensuring accountability in these scenarios becomes a complex issue. Moreover, the potential misuse of LLMs to generate misleading or malicious content can cause significant societal harm.
Privacy
LLMs often require massive amounts of data for training. The collection and use of personal and sensitive data can lead to privacy infringements. Ensuring the proper handling and anonymization of this information is a significant challenge, and failure to do so can result in legal and ethical violations.
2. Job Displacement
The automation of tasks through LLMs can lead to job displacement in various industries, particularly those that rely heavily on language-based tasks. Jobs in customer service, content creation, and even some legal and medical professions could be at risk. This displacement may result in social unrest and increased inequality, as those lacking the skills to adapt to new technology-driven roles may find themselves marginalized.
3. Security Risks
LLMs can be exploited for malicious purposes, such as generating fake news, phishing emails, or other deceptive content. These models may be used to manipulate public opinion or commit fraud. The complexity and potential power of LLMs make them a target for cybercriminals, and defending against such attacks requires continuous vigilance and resources.
4. Potential Biases
The algorithms behind LLMs are only as unbiased as the data they are trained on. If the training data includes biases related to gender, race, or other factors, the LLM may inadvertently perpetuate these biases. This can result in discriminatory practices or reinforce harmful stereotypes. Identifying and correcting these biases is a complex and ongoing challenge that necessitates a commitment to fairness and equality.
5. Environmental Impact
Training and running LLMs require significant computational resources, leading to substantial energy consumption. The environmental impact of these models, in terms of carbon footprint, is an often-overlooked downside. The push for more powerful models may exacerbate this issue, making it crucial to consider sustainability in the development and deployment of LLMs.
Conclusion
Large Language Models represent an exciting frontier in artificial intelligence, offering potential benefits across various domains. However, the downsides of LLMs are substantial and warrant careful consideration. Ethical dilemmas, job displacement, security risks, potential biases, and environmental impacts are all vital concerns that must be addressed.
The development of guidelines, regulations, and best practices for the responsible use of LLMs is essential to mitigate these downsides. Ongoing collaboration between researchers, policymakers, industry leaders, and other stakeholders will be necessary to ensure that LLMs are developed and utilized in a manner that respects societal values and norms. The challenges posed by LLMs are not insurmountable but require thoughtful, concerted efforts to ensure that the benefits of these technological advancements do not come at an unacceptable cost.
"""
model_input_chunk_size = {
    "all-MiniLM-L6-v2": 512,
    "intfloat/multilingual-e5-large": 512,
    "text-embedding-ada-002": 2048
}


def get_max_chunk_size(model: str) -> int:
    return model_input_chunk_size[model]
def chunk_docs(docs: list, metadatas: list, max_chunk_size: int, overlap: int = -1) -> tuple:
    """
    Chunk documents into smaller documents.

    :param docs: Documents to chunk
    :param metadatas: Metadata for each document
    :param max_chunk_size: Maximum chunk size, in characters (the splitter's default length function)
    :param overlap: Chunk overlap in characters; if -1, the overlap is 10% of max_chunk_size
    :return: Tuple of (chunked documents, chunked metadatas)
    """
    _overlap = overlap
    if _overlap == -1:
        _overlap = int(max_chunk_size * 0.1)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=max_chunk_size, chunk_overlap=_overlap,
                                                   add_start_index=True)
    # Wrap each input text in a LangChain Document and tag it with a generated original document id.
    _documents = [Document(page_content=doc, metadata={"original_doc_id": doc_id, **metadata})
                  for doc, metadata, doc_id in
                  zip(docs, metadatas, [str(uuid.uuid4()) for _ in range(len(docs))])]
    _out_docs = text_splitter.split_documents(_documents)
    print(_out_docs)
    _new_docs = [doc.page_content for doc in _out_docs]
    _new_metas = [{"page": i, **doc.metadata} for i, doc in enumerate(_out_docs)]
    return _new_docs, _new_metas
def test_chroma():
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="intfloat/multilingual-e5-large")
    # cap the underlying SentenceTransformer's maximum sequence length to the documented input size
    sentence_transformer_ef._model.max_seq_length = model_input_chunk_size["intfloat/multilingual-e5-large"]
    # get the first cached model (usually the only one)
    _max_chunk = get_max_chunk_size(next(iter(sentence_transformer_ef.models.keys())))
    _docs, _metas = chunk_docs([long_text], [{"type": "long_text"}], _max_chunk)
    print(_docs)
    print(_metas)
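

# The chunked documents and metadata produced above can then be stored in a Chroma collection.
# This is a minimal sketch under stated assumptions, not part of the original example: the
# function name, the "chunked_docs" collection name, and the in-memory chromadb.Client() are
# all illustrative choices.
def add_chunks_to_chroma():
    import chromadb

    client = chromadb.Client()
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="intfloat/multilingual-e5-large")
    collection = client.get_or_create_collection(name="chunked_docs",
                                                 embedding_function=sentence_transformer_ef)
    _max_chunk = get_max_chunk_size("intfloat/multilingual-e5-large")
    _docs, _metas = chunk_docs([long_text], [{"type": "long_text"}], _max_chunk)
    # Each chunk gets its own id; the id of the original document is preserved in the metadata
    # under "original_doc_id", and "start_index"/"page" locate the chunk within it.
    collection.add(documents=_docs, metadatas=_metas, ids=[str(uuid.uuid4()) for _ in _docs])
    print(collection.count())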