Skip to content

Instantly share code, notes, and snippets.

@abodacs
Forked from wfng92/semantic-search-main.py
Created December 16, 2022 19:39
Show Gist options
  • Save abodacs/dd5bd003cc8f5c2c4743cdfc0cd25e16 to your computer and use it in GitHub Desktop.
Save abodacs/dd5bd003cc8f5c2c4743cdfc0cd25e16 to your computer and use it in GitHub Desktop.
from sentence_transformers import SentenceTransformer, util
import torch
# Load the multilingual paraphrase model on CPU. Downloaded weights are cached
# in the current directory.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2', device='cpu', cache_folder='./')
# To cache the model in a dedicated folder instead (create the folder yourself
# beforehand):
# model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2', device='cpu', cache_folder='./models/')

# Corpus with example sentences.
corpus = [
    'I am a boy',
    'What are you doing?',
    'Can you help me?',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'A monkey is chasing after a goat',
    'The quick brown fox jumps over the lazy dog',
]

# Query sentences (English, Chinese, Spanish) matched against the corpus.
queries = ['I am in need of assistance', '我是男孩子', 'Qué estás haciendo']

# Encode the corpus once, outside the loop; only the queries change per iteration.
corpus_embedding = model.encode(corpus, convert_to_tensor=True)

# Never request more results than there are corpus sentences.
top_k = min(5, len(corpus))

for query in queries:
    query_embedding = model.encode(query, convert_to_tensor=True)

    # cos_sim yields a score matrix; row 0 holds the scores of this single
    # query against every corpus sentence.
    cos_scores = util.cos_sim(query_embedding, corpus_embedding)[0]

    # topk returns the k highest scores together with their corpus indices.
    top_scores, top_indices = torch.topk(cos_scores, k=top_k)

    print("Query:", query)
    print("---------------------------")
    for score, idx in zip(top_scores, top_indices):
        print(f'{round(score.item(), 3)} | {corpus[idx]}')
@abodacs
Copy link
Author

abodacs commented Dec 16, 2022

from sentence_transformers import SentenceTransformer, util
import torch

# Load the multilingual paraphrase model on CPU. Cache the downloaded weights
# in the current directory ('./'), not the filesystem root ('/'), which is
# normally not writable.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2', device='cpu', cache_folder='./')

# Corpus with example sentences.
corpus = [
    'I am a boy',
    'What are you doing?',
    'Can you help me?',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'A monkey is chasing after a goat',
    'The quick brown fox jumps over the lazy dog',
]

# Query sentences (English, Chinese, Spanish) matched against the corpus.
queries = ['I am in need of assistance', '我是男孩子', 'Qué estás haciendo']

# Embeddings are L2-normalized, so the dot product used below equals the
# cosine similarity. The corpus is encoded once, outside the query loop.
corpus_embedding = model.encode(corpus, convert_to_tensor=True, normalize_embeddings=True)

# Never request more results than there are corpus sentences.
top_k = min(5, len(corpus))

for query in queries:
    query_embedding = model.encode(query, convert_to_tensor=True, normalize_embeddings=True)

    # semantic_search returns one hit list per query; only one query is passed,
    # so take the first list. Each hit is a dict with 'corpus_id' and 'score'.
    hits = util.semantic_search(
        query_embedding,
        corpus_embedding,
        top_k=top_k,
        score_function=util.dot_score,
    )[0]

    print("Query:", query)
    print("---------------------------")
    for hit in hits:
        print(f"{round(hit['score'], 3)} | {corpus[hit['corpus_id']]}")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment