Skip to content

Instantly share code, notes, and snippets.

@AlexeyVatolin
Last active December 17, 2024 11:06
Show Gist options
  • Save AlexeyVatolin/1569b47e186d40bac2514f82a6ee8925 to your computer and use it in GitHub Desktop.
Save AlexeyVatolin/1569b47e186d40bac2514f82a6ee8925 to your computer and use it in GitHub Desktop.
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
from torch import Tensor
from transformers import AutoModel, AutoTokenizer
from mteb.models.gte_models import (
PromptType,
gte_Qwen1_5_7B_instruct,
gte_Qwen2_1_5B_instruct,
gte_Qwen2_7B_instruct,
)
def encode_with_sentence_transformer(queries, documents, model_name):
model = SentenceTransformer(model_name, trust_remote_code=True)
model.max_seq_length = 8192
query_embeddings = model.encode(queries, prompt_name="query")
document_embeddings = model.encode(documents)
scores = (query_embeddings @ document_embeddings.T) * 100
return scores
def encode_with_auto_model(queries, documents, model_name):
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
queries = [get_detailed_instruct(task, query) for query in queries]
input_texts = queries + documents
batch_dict = tokenizer(
input_texts, max_length=8192, padding=True, truncation=True, return_tensors="pt"
)
outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:2] @ embeddings[2:].T) * 100
return scores.detach().numpy()
def encode_with_mteb(queries, documents, model):
model = model.load_model()
query_embeddings = model.encode(queries, task_name="MSMARCO", prompt_type=PromptType.query)
passage_embeddings = model.encode(
documents, task_name="MSMARCO", prompt_type=PromptType.passage
)
scores = (query_embeddings @ passage_embeddings.T) * 100
return scores
def compute_scores(scores1, scores2, scores3):
mse1_2 = mean_squared_error(scores1, scores2)
mse1_3 = mean_squared_error(scores1, scores3)
mse2_3 = mean_squared_error(scores2, scores3)
return mse1_2, mse1_3, mse2_3
def print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3):
print("Scores from SentenceTransformer model:")
print(scores1.tolist())
print("Scores from AutoModel:")
print(scores2.tolist())
print("Scores from mteb model:")
print(scores3.tolist())
print(f"MSE between SentenceTransformer and AutoModel: {mse1_2}")
print(f"MSE between SentenceTransformer and mteb: {mse1_3}")
print(f"MSE between AutoModel and mteb: {mse2_3}")
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
if left_padding:
return last_hidden_states[:, -1]
else:
sequence_lengths = attention_mask.sum(dim=1) - 1
batch_size = last_hidden_states.shape[0]
return last_hidden_states[
torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths
]
def get_detailed_instruct(task_description: str, query: str) -> str:
return f"Instruct: {task_description}\nQuery: {query}"
if __name__ == "__main__":
task = "Given a web search query, retrieve relevant passages that answer the query"
queries = ["how much protein should a female eat", "summit define"]
documents = [
"As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
]
scores1 = encode_with_sentence_transformer(
queries, documents, "Alibaba-NLP/gte-Qwen2-7B-instruct"
)
scores2 = encode_with_auto_model(queries, documents, "Alibaba-NLP/gte-Qwen2-7B-instruct")
scores3 = encode_with_mteb(queries, documents, gte_Qwen2_7B_instruct)
mse1_2, mse1_3, mse2_3 = compute_scores(scores1, scores2, scores3)
print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3)
# Scores from SentenceTransformer model:
# [[70.39697265625, 3.4318289756774902], [4.516181945800781, 81.91806030273438]]
# Scores from AutoModel:
# [[70.3969955444336, 3.4318275451660156], [4.516171455383301, 81.91804504394531]]
# Scores from gte_Qwen2_7B_instruct model:
# [[70.39696502685547, 3.4318320751190186], [4.516174793243408, 81.91804504394531]]
# MSE between SentenceTransformer and AutoModel: 2.1719870346714742e-10
# MSE between SentenceTransformer and gte_Qwen2_7B_instruct: 8.79509798323852e-11
# MSE between AutoModel and gte_Qwen2_7B_instruct: 2.4074608973023714e-10
scores1 = encode_with_sentence_transformer(
queries, documents, "Alibaba-NLP/gte-Qwen1.5-7B-instruct"
)
scores2 = encode_with_auto_model(queries, documents, "Alibaba-NLP/gte-Qwen1.5-7B-instruct")
scores3 = encode_with_mteb(queries, documents, gte_Qwen1_5_7B_instruct)
mse1_2, mse1_3, mse2_3 = compute_scores(scores1, scores2, scores3)
print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3)
# Scores from SentenceTransformer model:
# [[70.00669860839844, 8.184866905212402], [14.6242094039917, 77.71407318115234]]
# Scores from AutoModel:
# [[70.00666809082031, 8.184863090515137], [14.6242036819458, 77.71405029296875]]
# Scores from mteb model:
# [[70.0066909790039, 8.184870719909668], [14.62420654296875, 77.71406555175781]]
# MSE between SentenceTransformer and AutoModel: 3.7562131183221936e-10
# MSE between SentenceTransformer and mteb: 3.4788172342814505e-11
# MSE between AutoModel and mteb: 2.0577317627612501e-10
scores1 = encode_with_sentence_transformer(
queries, documents, "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
)
scores2 = encode_with_auto_model(queries, documents, "Alibaba-NLP/gte-Qwen2-1.5B-instruct")
scores3 = encode_with_mteb(queries, documents, gte_Qwen2_1_5B_instruct)
mse1_2, mse1_3, mse2_3 = compute_scores(scores1, scores2, scores3)
print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3)
# Scores from SentenceTransformer model:
# [[78.49691772460938, 17.042865753173828], [14.924497604370117, 75.37960815429688]]
# Scores from AutoModel:
# [[78.49688720703125, 17.04286766052246], [14.924491882324219, 75.37960052490234]]
# Scores from mteb model:
# [[78.49691772460938, 17.042871475219727], [14.924491882324219, 75.37960052490234]]
# MSE between SentenceTransformer and AutoModel: 2.5647750589996576e-10
# MSE between SentenceTransformer and mteb: 3.092281986027956e-11
# MSE between AutoModel and mteb: 2.3646862246096134e-10
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
from torch import Tensor
from transformers import AutoModel, AutoTokenizer
# from mteb.models.linq_models import PromptType, Linq_Embed_Mistral
def encode_with_sentence_transformer(queries, documents, task, model_name):
prompt = f"Instruct: {task}\nQuery: "
model = SentenceTransformer(model_name, trust_remote_code=True)
query_embeddings = model.encode(queries, prompt=prompt)
passage_embeddings = model.encode(documents)
scores = model.similarity(query_embeddings, passage_embeddings) * 100
return scores
def encode_with_auto_model(queries, documents, task, model_name):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
queries = [get_detailed_instruct(task, query) for query in queries]
input_texts = queries + documents
batch_dict = tokenizer(
input_texts, max_length=4096, padding=True, truncation=True, return_tensors="pt"
)
outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:2] @ embeddings[2:].T) * 100
return scores.detach().numpy()
def encode_with_mteb(queries, documents, model):
model = model.load_model()
query_embeddings = model.encode(
queries, task_name="MIRACLRetrieval", prompt_type=PromptType.query
)
passage_embeddings = model.encode(
documents, task_name="MIRACLRetrieval", prompt_type=PromptType.passage
)
scores = (query_embeddings @ passage_embeddings.T) * 100
return scores
def compute_scores(scores1, scores2, scores3):
mse1_2 = mean_squared_error(scores1, scores2)
mse1_3 = mean_squared_error(scores1, scores3)
mse2_3 = mean_squared_error(scores2, scores3)
return mse1_2, mse1_3, mse2_3
def print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3):
print("Scores from SentenceTransformer model:")
print(scores1.tolist())
print("Scores from AutoModel:")
print(scores2.tolist())
print("Scores from mteb model:")
print(scores3.tolist())
print(f"MSE between SentenceTransformer and AutoModel: {mse1_2}")
print(f"MSE between SentenceTransformer and mteb: {mse1_3}")
print(f"MSE between AutoModel and mteb: {mse2_3}")
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
if left_padding:
return last_hidden_states[:, -1]
else:
sequence_lengths = attention_mask.sum(dim=1) - 1
batch_size = last_hidden_states.shape[0]
return last_hidden_states[
torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths
]
def get_detailed_instruct(task_description: str, query: str) -> str:
return f"Instruct: {task_description}\nQuery: {query}"
if __name__ == "__main__":
task = "Given a question, retrieve Wikipedia passages that answer the question"
queries = [
get_detailed_instruct(task, "최초의 원자력 발전소는 무엇인가?"),
get_detailed_instruct(task, "Who invented Hangul?"),
]
# No need to add instruction for retrieval documents
passages = [
"현재 사용되는 핵분열 방식을 이용한 전력생산은 1948년 9월 미국 테네시주 오크리지에 설치된 X-10 흑연원자로에서 전구의 불을 밝히는 데 사용되면서 시작되었다. 그리고 1954년 6월에 구소련의 오브닌스크에 건설된 흑연감속 비등경수 압력관형 원자로를 사용한 오브닌스크 원자력 발전소가 시험적으로 전력생산을 시작하였고, 최초의 상업용 원자력 엉더이로를 사용한 영국 셀라필드 원자력 단지에 위치한 콜더 홀(Calder Hall) 원자력 발전소로, 1956년 10월 17일 상업 운전을 시작하였다.",
"Hangul was personally created and promulgated by the fourth king of the Joseon dynasty, Sejong the Great.[1][2] Sejong's scholarly institute, the Hall of Worthies, is often credited with the work, and at least one of its scholars was heavily involved in its creation, but it appears to have also been a personal project of Sejong.",
]
scores1 = encode_with_sentence_transformer(
queries, passages, task, "Linq-AI-Research/Linq-Embed-Mistral"
)
scores2 = encode_with_auto_model(
queries, passages, task, "Linq-AI-Research/Linq-Embed-Mistral"
)
scores3 = encode_with_mteb(queries, passages, Linq_Embed_Mistral)
mse1_2, mse1_3, mse2_3 = compute_scores(scores1, scores2, scores3)
print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3)
# Scores from SentenceTransformer model:
# [[73.65106201171875, 30.973318099975586], [29.315975189208984, 78.59217834472656]]
# Scores from AutoModel:
# [[73.65103912353516, 30.97328758239746], [29.315948486328125, 78.59219360351562]]
# Scores from mteb model:
# [[73.65101623535156, 30.97330093383789], [29.315954208374023, 78.59220123291016]]
# MSE between SentenceTransformer and AutoModel: 6.002665031701326e-10
# MSE between SentenceTransformer and mteb: 8.385541150346398e-10
# MSE between AutoModel and mteb: 1.9826984498649836e-10
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
from torch import Tensor
from transformers import AutoModel, AutoTokenizer
from mteb.models.nvidia_models import PromptType, NV_embed_v2, NV_embed_v1
def encode_with_sentence_transformer(queries, documents, task, model_name):
prompt = f"Instruct: {task}\nQuery: "
model = SentenceTransformer(model_name, trust_remote_code=True)
model.max_seq_length = 32768
model.tokenizer.padding_side = "right"
def add_eos(input_examples):
input_examples = [
input_example + model.tokenizer.eos_token for input_example in input_examples
]
return input_examples
batch_size = 2
query_embeddings = model.encode(
add_eos(queries), batch_size=batch_size, prompt=prompt, normalize_embeddings=True
)
passage_embeddings = model.encode(
add_eos(passages), batch_size=batch_size, prompt="", normalize_embeddings=True
)
scores = (query_embeddings @ passage_embeddings.T) * 100
return scores
def encode_with_auto_model(queries, documents, task, model_name):
prompt = f"Instruct: {task}\nQuery: "
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
max_length = 32768
query_embeddings = model.encode(queries, instruction=prompt, max_length=max_length)
passage_embeddings = model.encode(documents, instruction="", max_length=max_length)
query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
passage_embeddings = F.normalize(passage_embeddings, p=2, dim=1)
scores = (query_embeddings @ passage_embeddings.T) * 100
return scores.detach().numpy()
def encode_with_mteb(queries, documents, task_name, model):
model = model.load_model()
query_embeddings = model.encode(queries, task_name=task_name, prompt_type=PromptType.query)
passage_embeddings = model.encode(
documents, task_name=task_name, prompt_type=PromptType.passage
)
scores = (query_embeddings @ passage_embeddings.T) * 100
return scores
def compute_scores(scores1, scores2, scores3):
mse1_2 = mean_squared_error(scores1, scores2)
mse1_3 = mean_squared_error(scores1, scores3)
mse2_3 = mean_squared_error(scores2, scores3)
return mse1_2, mse1_3, mse2_3
def print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3):
print("Scores from SentenceTransformer model:")
print(scores1.tolist())
print("Scores from AutoModel:")
print(scores2.tolist())
print("Scores from mteb model:")
print(scores3.tolist())
print(f"MSE between SentenceTransformer and AutoModel: {mse1_2}")
print(f"MSE between SentenceTransformer and mteb: {mse1_3}")
print(f"MSE between AutoModel and mteb: {mse2_3}")
if __name__ == "__main__":
task = "Given a question, retrieve passages that answer the question"
queries = ["최초의 원자력 발전소는 무엇인가?", "Who invented Hangul?"]
# No need to add instruction for retrieval documents
passages = [
"현재 사용되는 핵분열 방식을 이용한 전력생산은 1948년 9월 미국 테네시주 오크리지에 설치된 X-10 흑연원자로에서 전구의 불을 밝히는 데 사용되면서 시작되었다. 그리고 1954년 6월에 구소련의 오브닌스크에 건설된 흑연감속 비등경수 압력관형 원자로를 사용한 오브닌스크 원자력 발전소가 시험적으로 전력생산을 시작하였고, 최초의 상업용 원자력 엉더이로를 사용한 영국 셀라필드 원자력 단지에 위치한 콜더 홀(Calder Hall) 원자력 발전소로, 1956년 10월 17일 상업 운전을 시작하였다.",
"Hangul was personally created and promulgated by the fourth king of the Joseon dynasty, Sejong the Great.[1][2] Sejong's scholarly institute, the Hall of Worthies, is often credited with the work, and at least one of its scholars was heavily involved in its creation, but it appears to have also been a personal project of Sejong.",
]
scores1 = encode_with_sentence_transformer(queries, passages, task, "nvidia/NV-Embed-v2")
scores2 = encode_with_auto_model(queries, passages, task, "nvidia/NV-Embed-v2")
scores3 = encode_with_mteb(queries, passages, "MIRACLReranking", NV_embed_v2)
mse1_2, mse1_3, mse2_3 = compute_scores(scores1, scores2, scores3)
print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3)
# [[74.80958557128906, 8.955424308776855], [11.394946098327637, 73.29377746582031]]
# Scores from AutoModel:
# [[75.59257507324219, 8.993165016174316], [10.881285667419434, 72.11418151855469]]
# Scores from mteb model:
# [[74.80958557128906, 8.955424308776855], [11.394946098327637, 73.29377746582031]]
# MSE between SentenceTransformer and AutoModel: 0.5674476623535156
# MSE between SentenceTransformer and mteb: 0.0
# MSE between AutoModel and mteb: 0.5674476623535156
scores1 = encode_with_sentence_transformer(queries, passages, task, "nvidia/NV-Embed-v1")
scores2 = encode_with_auto_model(queries, passages, task, "nvidia/NV-Embed-v1")
scores3 = encode_with_mteb(queries, passages, "MIRACLReranking", NV_embed_v1)
mse1_2, mse1_3, mse2_3 = compute_scores(scores1, scores2, scores3)
print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3)
# [[61.517578125, 4.365936756134033], [6.746277809143066, 73.04427337646484]]
# Scores from AutoModel:
# [[62.26088333129883, 4.267183780670166], [6.847907543182373, 73.00653076171875]]
# Scores from mteb model:
# [[61.517578125, 4.365936756134033], [6.746277809143066, 73.04427337646484]]
# MSE between SentenceTransformer and AutoModel: 0.14350196719169617
# MSE between SentenceTransformer and mteb: 0.0
# MSE between AutoModel and mteb: 0.14350196719169617
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment