Skip to content

Instantly share code, notes, and snippets.

@alexlenail
Created November 9, 2022 12:53
Show Gist options
  • Save alexlenail/8334dd5aa689e2c7fff663ab3a036708 to your computer and use it in GitHub Desktop.
Save alexlenail/8334dd5aa689e2c7fff663ab3a036708 to your computer and use it in GitHub Desktop.
import mygene
import pandas as pd
def gene_annotations(names, map_from=['symbol', 'alias'], fields=['ensembl.gene', 'name', 'summary'], species='human'):
    """Fetch one annotation row per gene name from the MyGene.info service.

    The names are whitespace-stripped and de-duplicated before being sent to
    ``mygene.MyGeneInfo().querymany``. A short report of what was cleaned up
    is printed to stdout. When a name yields several rows, only the row with
    the highest ``_score`` is kept.

    Args:
        names: Iterable of gene name strings (anything ``pd.Series`` accepts).
        map_from: Passed to ``querymany`` as ``scopes``.
        fields: Passed to ``querymany`` as ``fields``.
        species: Passed to ``querymany`` as ``species``.

    Returns:
        A ``pandas.DataFrame`` with an ``input`` column (the queried name,
        renamed from ``query``) and at most one row per distinct input,
        sorted by ``input``.

    Note:
        The list defaults are never mutated here; they are kept as lists so
        the signature stays identical for existing callers.
    """
    names = pd.Series(names)
    print(f"passed {len(names)} symbols")

    # Compare once and reuse the count for both the check and the message.
    names_stripped = names.str.strip()
    n_changed = int((names_stripped != names).sum())
    if n_changed:
        print(f"{n_changed} names contained whitespace. Stripping...")

    names_stripped_unique = names_stripped.unique()
    n_duplicates = len(names_stripped) - len(names_stripped_unique)
    if n_duplicates:
        print(f"{n_duplicates} duplicates. {len(names_stripped_unique)} uniques.")
    print()

    mg = mygene.MyGeneInfo()
    result = mg.querymany(
        names_stripped_unique.tolist(),
        scopes=map_from,
        fields=fields,
        species=species,
        as_dataframe=True,
        returnall=True,
    )
    # Index by key instead of unpacking .values() positionally, so the code
    # does not depend on the ordering of the returned dict.
    hits = result['out']

    annotations = hits.reset_index().rename(columns={'query': 'input'})
    # '_score' may be absent when no query produced a hit; fall back to
    # sorting by input alone instead of raising KeyError.
    if '_score' in annotations.columns:
        annotations = annotations.sort_values(['input', '_score'], ascending=[True, False])
    else:
        annotations = annotations.sort_values('input')
    return annotations.drop_duplicates(subset='input', keep='first')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment