Skip to content

Instantly share code, notes, and snippets.

@datasciencemonkey
Created January 17, 2022 22:09
Show Gist options
  • Save datasciencemonkey/f6f18bcb67f9a4573ba96a36cca5ca87 to your computer and use it in GitHub Desktop.
Save datasciencemonkey/f6f18bcb67f9a4573ba96a36cca5ca87 to your computer and use it in GitHub Desktop.
compute document similarity
from flair.embeddings import (
FlairEmbeddings,
TransformerWordEmbeddings,
StackedEmbeddings,
)
from flair.data import Sentence
from flair.embeddings import DocumentPoolEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
# Word-level embedders to be stacked: forward and backward Flair language
# models plus a BERT transformer.
flair_forward_embedding = FlairEmbeddings("multi-forward")
flair_backward_embedding = FlairEmbeddings("multi-backward")
transformer_embedding = TransformerWordEmbeddings("bert-base-uncased")
# %%
# `baseline` is the reference sentence; the two utterances are compared to it.
baseline = Sentence("machine learning using the user history data")
utterance = Sentence("Run autoML job using the user's past data")
utterance2 = Sentence("Run sql query on user table")
stacked_embeddings = StackedEmbeddings(
    [flair_forward_embedding, flair_backward_embedding, transformer_embedding]
)
# Pools the stacked word embeddings into a single vector per sentence.
document_embeddings = DocumentPoolEmbeddings([stacked_embeddings])


def _as_row(sentence):
    """Return an embedded sentence's vector as a ``(1, n)`` NumPy array.

    ``sentence.embedding`` is a torch tensor, while scikit-learn converts its
    inputs through NumPy.  That conversion fails for tensors that live on a
    GPU or that track gradients, so the tensor is detached and moved to the
    CPU first.  On a CPU-only setup the values are unchanged.

    Args:
        sentence: A ``Sentence`` that has already been passed to
            ``document_embeddings.embed``.

    Returns:
        A 2-D array with a single row, the shape ``cosine_similarity`` expects.
    """
    return sentence.embedding.detach().cpu().numpy().reshape(1, -1)


# %%
# Embedding happens in place: each Sentence stores its own vector.
document_embeddings.embed(utterance)
document_embeddings.embed(utterance2)
document_embeddings.embed(baseline)
# %%
# Similarity of the first utterance to the baseline (1x1 matrix).
cosine_similarity(_as_row(baseline), _as_row(utterance))
# %%
# Similarity of the second utterance to the baseline (1x1 matrix).
cosine_similarity(_as_row(baseline), _as_row(utterance2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment