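# ---------------------------------------------------------------------------
# Script 1: sanity-check that three ways of running the Alibaba gte-Qwen
# embedding models (SentenceTransformer, raw transformers AutoModel with
# last-token pooling, and the mteb model wrappers) produce matching similarity
# scores on a small MS MARCO-style example. Recorded outputs are kept as
# comments after each run.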
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
from torch import Tensor
from transformers import AutoModel, AutoTokenizer

from mteb.models.gte_models import (
    PromptType,
    gte_Qwen1_5_7B_instruct,
    gte_Qwen2_1_5B_instruct,
    gte_Qwen2_7B_instruct,
)


def encode_with_sentence_transformer(queries, documents, model_name):
    model = SentenceTransformer(model_name, trust_remote_code=True)
    model.max_seq_length = 8192
    query_embeddings = model.encode(queries, prompt_name="query")
    document_embeddings = model.encode(documents)
    scores = (query_embeddings @ document_embeddings.T) * 100
    return scores


def encode_with_auto_model(queries, documents, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    # `task` is the module-level instruction string defined in the __main__ block below.
    queries = [get_detailed_instruct(task, query) for query in queries]
    input_texts = queries + documents
    batch_dict = tokenizer(
        input_texts, max_length=8192, padding=True, truncation=True, return_tensors="pt"
    )
    outputs = model(**batch_dict)
    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
    embeddings = F.normalize(embeddings, p=2, dim=1)
    # The first two rows are the query embeddings, the rest are document embeddings.
    scores = (embeddings[:2] @ embeddings[2:].T) * 100
    return scores.detach().numpy()


def encode_with_mteb(queries, documents, model):
    model = model.load_model()
    query_embeddings = model.encode(queries, task_name="MSMARCO", prompt_type=PromptType.query)
    passage_embeddings = model.encode(
        documents, task_name="MSMARCO", prompt_type=PromptType.passage
    )
    scores = (query_embeddings @ passage_embeddings.T) * 100
    return scores


def compute_scores(scores1, scores2, scores3):
    mse1_2 = mean_squared_error(scores1, scores2)
    mse1_3 = mean_squared_error(scores1, scores3)
    mse2_3 = mean_squared_error(scores2, scores3)
    return mse1_2, mse1_3, mse2_3


def print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3):
    print("Scores from SentenceTransformer model:")
    print(scores1.tolist())
    print("Scores from AutoModel:")
    print(scores2.tolist())
    print("Scores from mteb model:")
    print(scores3.tolist())
    print(f"MSE between SentenceTransformer and AutoModel: {mse1_2}")
    print(f"MSE between SentenceTransformer and mteb: {mse1_3}")
    print(f"MSE between AutoModel and mteb: {mse2_3}")
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[
            torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths
        ]


def get_detailed_instruct(task_description: str, query: str) -> str:
    return f"Instruct: {task_description}\nQuery: {query}"


if __name__ == "__main__":
    task = "Given a web search query, retrieve relevant passages that answer the query"
    queries = ["how much protein should a female eat", "summit define"]
    documents = [
        "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
        "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
    ]

    scores1 = encode_with_sentence_transformer(
        queries, documents, "Alibaba-NLP/gte-Qwen2-7B-instruct"
    )
    scores2 = encode_with_auto_model(queries, documents, "Alibaba-NLP/gte-Qwen2-7B-instruct")
    scores3 = encode_with_mteb(queries, documents, gte_Qwen2_7B_instruct)
    mse1_2, mse1_3, mse2_3 = compute_scores(scores1, scores2, scores3)
    print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3)
    # Scores from SentenceTransformer model:
    # [[70.39697265625, 3.4318289756774902], [4.516181945800781, 81.91806030273438]]
    # Scores from AutoModel:
    # [[70.3969955444336, 3.4318275451660156], [4.516171455383301, 81.91804504394531]]
    # Scores from gte_Qwen2_7B_instruct model:
    # [[70.39696502685547, 3.4318320751190186], [4.516174793243408, 81.91804504394531]]
    # MSE between SentenceTransformer and AutoModel: 2.1719870346714742e-10
    # MSE between SentenceTransformer and gte_Qwen2_7B_instruct: 8.79509798323852e-11
    # MSE between AutoModel and gte_Qwen2_7B_instruct: 2.4074608973023714e-10

    scores1 = encode_with_sentence_transformer(
        queries, documents, "Alibaba-NLP/gte-Qwen1.5-7B-instruct"
    )
    scores2 = encode_with_auto_model(queries, documents, "Alibaba-NLP/gte-Qwen1.5-7B-instruct")
    scores3 = encode_with_mteb(queries, documents, gte_Qwen1_5_7B_instruct)
    mse1_2, mse1_3, mse2_3 = compute_scores(scores1, scores2, scores3)
    print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3)
    # Scores from SentenceTransformer model:
    # [[70.00669860839844, 8.184866905212402], [14.6242094039917, 77.71407318115234]]
    # Scores from AutoModel:
    # [[70.00666809082031, 8.184863090515137], [14.6242036819458, 77.71405029296875]]
    # Scores from mteb model:
    # [[70.0066909790039, 8.184870719909668], [14.62420654296875, 77.71406555175781]]
    # MSE between SentenceTransformer and AutoModel: 3.7562131183221936e-10
    # MSE between SentenceTransformer and mteb: 3.4788172342814505e-11
    # MSE between AutoModel and mteb: 2.0577317627612501e-10

    scores1 = encode_with_sentence_transformer(
        queries, documents, "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
    )
    scores2 = encode_with_auto_model(queries, documents, "Alibaba-NLP/gte-Qwen2-1.5B-instruct")
    scores3 = encode_with_mteb(queries, documents, gte_Qwen2_1_5B_instruct)
    mse1_2, mse1_3, mse2_3 = compute_scores(scores1, scores2, scores3)
    print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3)
    # Scores from SentenceTransformer model:
    # [[78.49691772460938, 17.042865753173828], [14.924497604370117, 75.37960815429688]]
    # Scores from AutoModel:
    # [[78.49688720703125, 17.04286766052246], [14.924491882324219, 75.37960052490234]]
    # Scores from mteb model:
    # [[78.49691772460938, 17.042871475219727], [14.924491882324219, 75.37960052490234]]
    # MSE between SentenceTransformer and AutoModel: 2.5647750589996576e-10
    # MSE between SentenceTransformer and mteb: 3.092281986027956e-11
    # MSE between AutoModel and mteb: 2.3646862246096134e-10
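

# ---------------------------------------------------------------------------
# Script 2: the same three-way comparison (SentenceTransformer vs. raw
# AutoModel with last-token pooling vs. the mteb wrapper) for
# Linq-AI-Research/Linq-Embed-Mistral on a Korean/English retrieval example.
# Recorded outputs are kept as comments at the end of the script.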
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
from torch import Tensor
from transformers import AutoModel, AutoTokenizer

from mteb.models.linq_models import PromptType, Linq_Embed_Mistral


def encode_with_sentence_transformer(queries, documents, task, model_name):
    prompt = f"Instruct: {task}\nQuery: "
    model = SentenceTransformer(model_name, trust_remote_code=True)
    query_embeddings = model.encode(queries, prompt=prompt)
    passage_embeddings = model.encode(documents)
    scores = model.similarity(query_embeddings, passage_embeddings) * 100
    return scores


def encode_with_auto_model(queries, documents, task, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    queries = [get_detailed_instruct(task, query) for query in queries]
    input_texts = queries + documents
    batch_dict = tokenizer(
        input_texts, max_length=4096, padding=True, truncation=True, return_tensors="pt"
    )
    outputs = model(**batch_dict)
    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
    embeddings = F.normalize(embeddings, p=2, dim=1)
    scores = (embeddings[:2] @ embeddings[2:].T) * 100
    return scores.detach().numpy()


def encode_with_mteb(queries, documents, model):
    model = model.load_model()
    query_embeddings = model.encode(
        queries, task_name="MIRACLRetrieval", prompt_type=PromptType.query
    )
    passage_embeddings = model.encode(
        documents, task_name="MIRACLRetrieval", prompt_type=PromptType.passage
    )
    scores = (query_embeddings @ passage_embeddings.T) * 100
    return scores


def compute_scores(scores1, scores2, scores3):
    mse1_2 = mean_squared_error(scores1, scores2)
    mse1_3 = mean_squared_error(scores1, scores3)
    mse2_3 = mean_squared_error(scores2, scores3)
    return mse1_2, mse1_3, mse2_3


def print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3):
    print("Scores from SentenceTransformer model:")
    print(scores1.tolist())
    print("Scores from AutoModel:")
    print(scores2.tolist())
    print("Scores from mteb model:")
    print(scores3.tolist())
    print(f"MSE between SentenceTransformer and AutoModel: {mse1_2}")
    print(f"MSE between SentenceTransformer and mteb: {mse1_3}")
    print(f"MSE between AutoModel and mteb: {mse2_3}")


def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[
            torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths
        ]


def get_detailed_instruct(task_description: str, query: str) -> str:
    return f"Instruct: {task_description}\nQuery: {query}"


if __name__ == "__main__":
    task = "Given a question, retrieve Wikipedia passages that answer the question"
    queries = [
        get_detailed_instruct(task, "최초의 원자력 발전소는 무엇인가?"),
        get_detailed_instruct(task, "Who invented Hangul?"),
    ]
    # No need to add instruction for retrieval documents
    passages = [
        "현재 사용되는 핵분열 방식을 이용한 전력생산은 1948년 9월 미국 테네시주 오크리지에 설치된 X-10 흑연원자로에서 전구의 불을 밝히는 데 사용되면서 시작되었다. 그리고 1954년 6월에 구소련의 오브닌스크에 건설된 흑연감속 비등경수 압력관형 원자로를 사용한 오브닌스크 원자력 발전소가 시험적으로 전력생산을 시작하였고, 최초의 상업용 원자력 엉더이로를 사용한 영국 셀라필드 원자력 단지에 위치한 콜더 홀(Calder Hall) 원자력 발전소로, 1956년 10월 17일 상업 운전을 시작하였다.",
        "Hangul was personally created and promulgated by the fourth king of the Joseon dynasty, Sejong the Great.[1][2] Sejong's scholarly institute, the Hall of Worthies, is often credited with the work, and at least one of its scholars was heavily involved in its creation, but it appears to have also been a personal project of Sejong.",
    ]

    scores1 = encode_with_sentence_transformer(
        queries, passages, task, "Linq-AI-Research/Linq-Embed-Mistral"
    )
    scores2 = encode_with_auto_model(
        queries, passages, task, "Linq-AI-Research/Linq-Embed-Mistral"
    )
    scores3 = encode_with_mteb(queries, passages, Linq_Embed_Mistral)
    mse1_2, mse1_3, mse2_3 = compute_scores(scores1, scores2, scores3)
    print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3)
    # Scores from SentenceTransformer model:
    # [[73.65106201171875, 30.973318099975586], [29.315975189208984, 78.59217834472656]]
    # Scores from AutoModel:
    # [[73.65103912353516, 30.97328758239746], [29.315948486328125, 78.59219360351562]]
    # Scores from mteb model:
    # [[73.65101623535156, 30.97330093383789], [29.315954208374023, 78.59220123291016]]
    # MSE between SentenceTransformer and AutoModel: 6.002665031701326e-10
    # MSE between SentenceTransformer and mteb: 8.385541150346398e-10
    # MSE between AutoModel and mteb: 1.9826984498649836e-10
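

# ---------------------------------------------------------------------------
# Script 3: the same three-way comparison for nvidia/NV-Embed-v2 and
# NV-Embed-v1. Here the raw-transformers path uses NV-Embed's own remote-code
# encode() (instruction + max_length) instead of manual last-token pooling.
# Recorded outputs are kept as comments after each run.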
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
from torch import Tensor
from transformers import AutoModel, AutoTokenizer

from mteb.models.nvidia_models import PromptType, NV_embed_v2, NV_embed_v1


def encode_with_sentence_transformer(queries, documents, task, model_name):
    prompt = f"Instruct: {task}\nQuery: "
    model = SentenceTransformer(model_name, trust_remote_code=True)
    model.max_seq_length = 32768
    model.tokenizer.padding_side = "right"

    def add_eos(input_examples):
        # NV-Embed expects an explicit EOS token appended to every input text.
        input_examples = [
            input_example + model.tokenizer.eos_token for input_example in input_examples
        ]
        return input_examples

    batch_size = 2
    query_embeddings = model.encode(
        add_eos(queries), batch_size=batch_size, prompt=prompt, normalize_embeddings=True
    )
    passage_embeddings = model.encode(
        add_eos(documents), batch_size=batch_size, prompt="", normalize_embeddings=True
    )
    scores = (query_embeddings @ passage_embeddings.T) * 100
    return scores
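

# NV-Embed's remote-code AutoModel exposes its own encode() that takes the
# instruction prompt and max_length directly, so no manual pooling is needed here.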
def encode_with_auto_model(queries, documents, task, model_name):
    prompt = f"Instruct: {task}\nQuery: "
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    max_length = 32768
    query_embeddings = model.encode(queries, instruction=prompt, max_length=max_length)
    passage_embeddings = model.encode(documents, instruction="", max_length=max_length)
    query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
    passage_embeddings = F.normalize(passage_embeddings, p=2, dim=1)
    scores = (query_embeddings @ passage_embeddings.T) * 100
    return scores.detach().numpy()


def encode_with_mteb(queries, documents, task_name, model):
    model = model.load_model()
    query_embeddings = model.encode(queries, task_name=task_name, prompt_type=PromptType.query)
    passage_embeddings = model.encode(
        documents, task_name=task_name, prompt_type=PromptType.passage
    )
    scores = (query_embeddings @ passage_embeddings.T) * 100
    return scores


def compute_scores(scores1, scores2, scores3):
    mse1_2 = mean_squared_error(scores1, scores2)
    mse1_3 = mean_squared_error(scores1, scores3)
    mse2_3 = mean_squared_error(scores2, scores3)
    return mse1_2, mse1_3, mse2_3


def print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3):
    print("Scores from SentenceTransformer model:")
    print(scores1.tolist())
    print("Scores from AutoModel:")
    print(scores2.tolist())
    print("Scores from mteb model:")
    print(scores3.tolist())
    print(f"MSE between SentenceTransformer and AutoModel: {mse1_2}")
    print(f"MSE between SentenceTransformer and mteb: {mse1_3}")
    print(f"MSE between AutoModel and mteb: {mse2_3}")


if __name__ == "__main__":
    task = "Given a question, retrieve passages that answer the question"
    queries = ["최초의 원자력 발전소는 무엇인가?", "Who invented Hangul?"]
    # No need to add instruction for retrieval documents
    passages = [
        "현재 사용되는 핵분열 방식을 이용한 전력생산은 1948년 9월 미국 테네시주 오크리지에 설치된 X-10 흑연원자로에서 전구의 불을 밝히는 데 사용되면서 시작되었다. 그리고 1954년 6월에 구소련의 오브닌스크에 건설된 흑연감속 비등경수 압력관형 원자로를 사용한 오브닌스크 원자력 발전소가 시험적으로 전력생산을 시작하였고, 최초의 상업용 원자력 엉더이로를 사용한 영국 셀라필드 원자력 단지에 위치한 콜더 홀(Calder Hall) 원자력 발전소로, 1956년 10월 17일 상업 운전을 시작하였다.",
        "Hangul was personally created and promulgated by the fourth king of the Joseon dynasty, Sejong the Great.[1][2] Sejong's scholarly institute, the Hall of Worthies, is often credited with the work, and at least one of its scholars was heavily involved in its creation, but it appears to have also been a personal project of Sejong.",
    ]

    scores1 = encode_with_sentence_transformer(queries, passages, task, "nvidia/NV-Embed-v2")
    scores2 = encode_with_auto_model(queries, passages, task, "nvidia/NV-Embed-v2")
    scores3 = encode_with_mteb(queries, passages, "MIRACLReranking", NV_embed_v2)
    mse1_2, mse1_3, mse2_3 = compute_scores(scores1, scores2, scores3)
    print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3)
    # Scores from SentenceTransformer model:
    # [[74.80958557128906, 8.955424308776855], [11.394946098327637, 73.29377746582031]]
    # Scores from AutoModel:
    # [[75.59257507324219, 8.993165016174316], [10.881285667419434, 72.11418151855469]]
    # Scores from mteb model:
    # [[74.80958557128906, 8.955424308776855], [11.394946098327637, 73.29377746582031]]
    # MSE between SentenceTransformer and AutoModel: 0.5674476623535156
    # MSE between SentenceTransformer and mteb: 0.0
    # MSE between AutoModel and mteb: 0.5674476623535156

    scores1 = encode_with_sentence_transformer(queries, passages, task, "nvidia/NV-Embed-v1")
    scores2 = encode_with_auto_model(queries, passages, task, "nvidia/NV-Embed-v1")
    scores3 = encode_with_mteb(queries, passages, "MIRACLReranking", NV_embed_v1)
    mse1_2, mse1_3, mse2_3 = compute_scores(scores1, scores2, scores3)
    print_report(scores1, scores2, scores3, mse1_2, mse1_3, mse2_3)
    # Scores from SentenceTransformer model:
    # [[61.517578125, 4.365936756134033], [6.746277809143066, 73.04427337646484]]
    # Scores from AutoModel:
    # [[62.26088333129883, 4.267183780670166], [6.847907543182373, 73.00653076171875]]
    # Scores from mteb model:
    # [[61.517578125, 4.365936756134033], [6.746277809143066, 73.04427337646484]]
    # MSE between SentenceTransformer and AutoModel: 0.14350196719169617
    # MSE between SentenceTransformer and mteb: 0.0
    # MSE between AutoModel and mteb: 0.14350196719169617