@pszemraj
Last active February 2, 2024 16:18
Setting up nomic-embed-text-v1 with sentence-transformers (sbert) and ONNX Runtime
# pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util, models

model_name = "nomic-ai/nomic-embed-text-v1"
pooling_mode = "mean"

word_embedding_model = models.Transformer(
    model_name,
    max_seq_length=8192,
    model_args={"trust_remote_code": True, "rotary_scaling_factor": 2},
    tokenizer_args={"trust_remote_code": True},
)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode=pooling_mode,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

embeddings = model.encode(["here is some text", "oh look there is more"], batch_size=16)
print(embeddings.shape)
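
Optional sanity check (not part of the original gist): util is already imported above, so the two embeddings returned by model.encode can be compared with cosine similarity.

# compare the two example sentences from above with cosine similarity
similarity = util.cos_sim(embeddings[0], embeddings[1])
print(similarity.item())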
# pip install -U "optimum[onnxruntime]"
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
def mean_pooling(model_output, attention_mask):
token_embeddings = model_output[0]
input_mask_expanded = (
attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
)
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
input_mask_expanded.sum(1), min=1e-9
)
sentences = ["What is TSNE?", "Who is Laurens van der Maaten?"]
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", model_max_length=8192)
model = ORTModelForFeatureExtraction.from_pretrained(
"nomic-ai/nomic-embed-text-v1",
trust_remote_code=True,
file_name="onnx/model_quantized.onnx", # model.onnx for unquantized version
rotary_scaling_factor=2,
)
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
model_output = model(**encoded_input)
embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
embeddings = F.normalize(embeddings, p=2, dim=1)
print(embeddings)
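
Because the ONNX embeddings above are L2-normalized, cosine similarity reduces to a dot product; a minimal check using only the tensors defined above:

# 2x2 cosine-similarity matrix for the two example sentences
scores = embeddings @ embeddings.T
print(scores)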
# pip install -U "optimum[onnxruntime]"
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
from tqdm.auto import trange
class TextEmbedder:
def __init__(
self,
model_name="nomic-ai/nomic-embed-text-v1",
tokenizer_name="bert-base-uncased",
model_file_name="onnx/model_quantized.onnx",
batch_size=8,
):
self.model_name = model_name
self.tokenizer_name = tokenizer_name
self.batch_size = batch_size
self.tokenizer = AutoTokenizer.from_pretrained(
self.tokenizer_name or self.model_name, model_max_length=8192
)
self.model = ORTModelForFeatureExtraction.from_pretrained(
model_name,
trust_remote_code=True,
file_name=model_file_name, # Use "onnx/model.onnx" for unquantized version
rotary_scaling_factor=2,
)
@staticmethod
def mean_pooling(model_output, attention_mask):
token_embeddings = model_output[0]
input_mask_expanded = (
attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
)
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
input_mask_expanded.sum(1), min=1e-9
)
def encode_batch(self, sentences_batch):
encoded_input = self.tokenizer(
sentences_batch, padding=True, truncation=True, return_tensors="pt"
)
with torch.no_grad():
model_output = self.model(**encoded_input)
embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"])
return F.normalize(embeddings, p=2, dim=1)
def encode(
self,
sentences,
return_list=False,
disable_progress=False,
):
# Ensure sentences is a list
if isinstance(sentences, str):
sentences = [sentences] # Wrap a single sentence into a list
all_embeddings = []
for i in trange(0, len(sentences), self.batch_size, disable=disable_progress):
batch_sentences = sentences[i : i + self.batch_size]
batch_embeddings = self.encode_batch(batch_sentences)
all_embeddings.append(batch_embeddings)
all_embeddings = torch.cat(all_embeddings, dim=0)
return all_embeddings.cpu().tolist() if return_list else all_embeddings.cpu()
def __str__(self):
return (
f"TextEmbedder(model_name={self.model_name}, batch_size={self.batch_size})"
)
def __repr__(self):
return f"TextEmbedder(model_name='{self.model_name}', batch_size={self.batch_size})"
def __call__(self, sentences, **kwargs):
return self.encode(sentences, **kwargs)
# Example usage
if __name__ == "__main__":
sentences = [
"What is TSNE?",
"Who is Laurens van der Maaten?",
"Short sentence.",
"Another example.",
"Yet another query.",
"What is machine learning?",
"Tell me about OpenAI.",
"Explain deep learning.",
]
embedder = TextEmbedder(batch_size=3)
embeddings = embedder.encode(sentences)
print(embeddings)
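    # Hypothetical extension (not in the original gist): rank the example
    # sentences against an illustrative query. Since encode() returns
    # L2-normalized embeddings, cosine similarity is just a matrix product.
    query_embedding = embedder.encode("What is dimensionality reduction?", disable_progress=True)
    scores = embeddings @ query_embedding.T  # shape (8, 1)
    top = scores.squeeze(-1).argsort(descending=True)[:3]
    for idx in top.tolist():
        print(f"{scores[idx].item():.3f}  {sentences[idx]}")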