setting up nomic-embed-text-v1 in sbert and ONNX
# pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util, models

model_name = "nomic-ai/nomic-embed-text-v1"
pooling_mode = "mean"

word_embedding_model = models.Transformer(
    model_name,
    max_seq_length=8192,
    model_args={"trust_remote_code": True, "rotary_scaling_factor": 2},
    tokenizer_args={"trust_remote_code": True},
)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode=pooling_mode,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

embeddings = model.encode(["here is some text", "oh look there is more"], batch_size=16)
print(embeddings.shape)
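As a quick sanity check, the two embeddings produced above can be compared directly. A minimal sketch using util.cos_sim (already imported above); nothing beyond the code above is assumed:

# Cosine similarity between the two example embeddings.
# util.cos_sim accepts numpy arrays and returns a torch tensor.
similarity = util.cos_sim(embeddings[0], embeddings[1])
print(similarity)  # 1x1 tensor with the cosine similarity score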
# pip install -U "optimum[onnxruntime]"
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction


def mean_pooling(model_output, attention_mask):
    # Average the token embeddings, masking out padding tokens via the attention mask.
    token_embeddings = model_output[0]
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )


sentences = ["What is TSNE?", "Who is Laurens van der Maaten?"]

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", model_max_length=8192)
model = ORTModelForFeatureExtraction.from_pretrained(
    "nomic-ai/nomic-embed-text-v1",
    trust_remote_code=True,
    file_name="onnx/model_quantized.onnx",  # use "onnx/model.onnx" for the unquantized version
    rotary_scaling_factor=2,
)

encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    model_output = model(**encoded_input)

embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
embeddings = F.normalize(embeddings, p=2, dim=1)
print(embeddings)
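Because the embeddings above are L2-normalized, cosine similarity reduces to a plain dot product. A minimal follow-up sketch with no extra dependencies:

# Pairwise cosine similarities of the normalized embeddings (a 2x2 matrix here).
similarity = embeddings @ embeddings.T
print(similarity)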
# pip install -U "optimum[onnxruntime]"
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
from tqdm.auto import trange


class TextEmbedder:
    """Batched ONNX embedding wrapper for nomic-embed-text-v1."""

    def __init__(
        self,
        model_name="nomic-ai/nomic-embed-text-v1",
        tokenizer_name="bert-base-uncased",
        model_file_name="onnx/model_quantized.onnx",
        batch_size=8,
    ):
        self.model_name = model_name
        self.tokenizer_name = tokenizer_name
        self.batch_size = batch_size
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.tokenizer_name or self.model_name, model_max_length=8192
        )
        self.model = ORTModelForFeatureExtraction.from_pretrained(
            model_name,
            trust_remote_code=True,
            file_name=model_file_name,  # use "onnx/model.onnx" for the unquantized version
            rotary_scaling_factor=2,
        )

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        # Average the token embeddings, masking out padding tokens via the attention mask.
        token_embeddings = model_output[0]
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )

    def encode_batch(self, sentences_batch):
        encoded_input = self.tokenizer(
            sentences_batch, padding=True, truncation=True, return_tensors="pt"
        )
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"])
        return F.normalize(embeddings, p=2, dim=1)

    def encode(
        self,
        sentences,
        return_list=False,
        disable_progress=False,
    ):
        if isinstance(sentences, str):
            sentences = [sentences]  # wrap a single sentence into a list
        all_embeddings = []
        for i in trange(0, len(sentences), self.batch_size, disable=disable_progress):
            batch_sentences = sentences[i : i + self.batch_size]
            batch_embeddings = self.encode_batch(batch_sentences)
            all_embeddings.append(batch_embeddings)
        all_embeddings = torch.cat(all_embeddings, dim=0)
        return all_embeddings.cpu().tolist() if return_list else all_embeddings.cpu()

    def __str__(self):
        return (
            f"TextEmbedder(model_name={self.model_name}, batch_size={self.batch_size})"
        )

    def __repr__(self):
        return f"TextEmbedder(model_name='{self.model_name}', batch_size={self.batch_size})"

    def __call__(self, sentences, **kwargs):
        return self.encode(sentences, **kwargs)


# Example usage
if __name__ == "__main__":
    sentences = [
        "What is TSNE?",
        "Who is Laurens van der Maaten?",
        "Short sentence.",
        "Another example.",
        "Yet another query.",
        "What is machine learning?",
        "Tell me about OpenAI.",
        "Explain deep learning.",
    ]
    embedder = TextEmbedder(batch_size=3)
    embeddings = embedder.encode(sentences)
    print(embeddings)
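Continuing the example above (reusing embedder, embeddings, and sentences), the normalized embeddings can also drive a small retrieval loop. The query string below is purely illustrative:

# Hypothetical retrieval sketch: rank the example sentences against a new query.
# Embeddings are L2-normalized, so a dot product scores by cosine similarity.
query_embedding = embedder.encode("What is dimensionality reduction?", disable_progress=True)
scores = (embeddings @ query_embedding.T).squeeze(-1)
top = torch.topk(scores, k=3)
for score, idx in zip(top.values.tolist(), top.indices.tolist()):
    print(f"{score:.3f}  {sentences[idx]}")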