Skip to content

Instantly share code, notes, and snippets.

@joelthe1
Created October 28, 2019 18:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joelthe1/ac2fb77ae5b2ab12e902c49f35fbc0df to your computer and use it in GitHub Desktop.
Save joelthe1/ac2fb77ae5b2ab12e902c49f35fbc0df to your computer and use it in GitHub Desktop.
A script that ranks UniProt human protein names by BioBERT embedding cosine distance to each query and saves the rankings as a pickle.
from bert_serving.client import BertClient
from scipy import spatial
import pickle
# Rank reference protein names against query strings by the cosine distance
# between their sentence embeddings, and pickle the ten closest references
# for every query.
#
# Embeddings are requested from a running bert-as-service server through
# BertClient; everything else is local file I/O and distance ranking.

REF_PATH = '/path/to/uniprot/protiens-/prots_human.tsv'
QUERY_PATH = '/path/to/proteins/to/query-/bioc-train.csv'
OUTPUT_PATH = '/output/results.pkl'
TOP_K = 10


def read_data_lines(path):
    """Return the non-blank data lines of a text file, without its header.

    The first line of the file is treated as a header and dropped.  Lines
    that are empty or whitespace-only are dropped as well: they hold no
    record, and would otherwise fail the column lookup in
    ``protein_names`` or be submitted for embedding as empty text.
    Surviving lines are returned unmodified (not stripped).
    """
    with open(path) as data_file:
        lines = data_file.read().splitlines()[1:]
    return [line for line in lines if line.strip()]


def protein_names(rows):
    """Return the third tab-separated field of every row.

    Raises:
        IndexError: if a row has fewer than three tab-separated fields.
    """
    return [row.split('\t')[2] for row in rows]


def rank_nearest(queries, query_embeddings, ref_prots, ref_embeddings,
                 top_k=10):
    """Map each query to its ``top_k`` closest reference proteins.

    Args:
        queries: query strings; ``queries[i]`` belongs to
            ``query_embeddings[i]``.
        query_embeddings: one 1-D embedding vector per query.
        ref_prots: reference names; ``ref_prots[j]`` belongs to
            ``ref_embeddings[j]``.
        ref_embeddings: one 1-D embedding vector per reference name.
        top_k: maximum number of references kept per query.

    Returns:
        dict mapping each query string to a list of
        ``(reference_name, cosine_distance)`` tuples sorted by ascending
        distance.  References at equal distance keep their input order.
        If a query string occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if a list of names and its embeddings differ in length.
    """
    if len(queries) != len(query_embeddings):
        raise ValueError('queries and query_embeddings differ in length')
    if len(ref_prots) != len(ref_embeddings):
        raise ValueError('ref_prots and ref_embeddings differ in length')

    results = {}
    for q_idx, query_embedding in enumerate(query_embeddings):
        query = queries[q_idx]
        print(q_idx, query)  # progress indicator, one line per query
        if len(ref_embeddings) == 0:
            # cdist needs a 2-D reference matrix; nothing to rank anyway.
            results[query] = []
            continue
        # One vectorised call per query instead of one Python-level call
        # per (query, reference) pair.  Values may differ from
        # spatial.distance.cosine in the last floating-point digits.
        distances = spatial.distance.cdist(
            [query_embedding], ref_embeddings, 'cosine')[0]
        # A stable sort keeps tied references in input order.
        nearest = distances.argsort(kind='stable')[:top_k]
        results[query] = [(ref_prots[i], distances[i]) for i in nearest]
    return results


def main():
    """Read the inputs, embed them, rank them and write the pickle."""
    # Load the names of the human subset of the UniProt entries.
    ref_rows = read_data_lines(REF_PATH)
    print(ref_rows[:10])
    ref_prots = protein_names(ref_rows)
    print(ref_prots[:10])

    # Each remaining line of the query file is embedded as one string.
    queries = read_data_lines(QUERY_PATH)

    # Used as a context manager so the client's connections are closed
    # as soon as both batches are encoded, even if encoding fails.
    with BertClient() as bc:
        ref_embeddings = bc.encode(ref_prots)
        print(len(ref_embeddings), len(ref_prots))
        query_embeddings = bc.encode(queries)

    results = rank_nearest(
        queries, query_embeddings, ref_prots, ref_embeddings, top_k=TOP_K)

    with open(OUTPUT_PATH, 'wb') as wfile:
        pickle.dump(results, wfile)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment