Skip to content

Instantly share code, notes, and snippets.

@alexlenail
Last active July 6, 2022 19:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexlenail/d1c4607712c30361333813f49ce75eef to your computer and use it in GitHub Desktop.
Save alexlenail/d1c4607712c30361333813f49ce75eef to your computer and use it in GitHub Desktop.
import mygene
import pandas as pd
def namespace_mapping(names, map_from=('symbol', 'alias'), map_to='symbol', species='human'):
    """Map gene identifiers from one namespace to another via MyGene.info.

    The names are whitespace-stripped and de-duplicated, then sent to
    ``mygene.MyGeneInfo().querymany``. For every queried name only the
    highest-``_score`` hit is kept.

    Args:
        names: Iterable of identifier strings to map.
        map_from: A scope name or a sequence of scope names, passed to
            ``querymany`` as ``scopes``.
        map_to: The field requested from ``querymany``; its value is the
            mapped identifier.
        species: Passed through to ``querymany`` as ``species``.

    Returns:
        A tuple ``(updates, missing, names_updated)``:

        * ``updates`` -- Series indexed by (stripped) input name holding the
          mapped value, only for names whose mapped value differs from the
          input.
        * ``missing`` -- the "missing" entry reported by ``querymany``
          (an empty list when ``names`` is empty).
        * ``names_updated`` -- array with one entry per input name, in input
          order: the mapped value where one was found, otherwise the stripped
          input name.
    """
    # Drop any caller-supplied index; only the values and their order matter.
    names = pd.Series(names).reset_index(drop=True)
    print(f"passed {len(names)} symbols")

    if names.empty:
        # Nothing to look up: skip the remote query entirely.
        nothing = pd.Series(dtype=object, name=map_to)
        return nothing, [], nothing.values

    names_stripped = names.str.strip()
    changed_by_strip = names_stripped != names
    if changed_by_strip.any():
        print(f"{changed_by_strip.sum()} names contained whitespace. Stripping...")

    names_stripped_unique = names_stripped.unique()
    if len(names_stripped_unique) != len(names_stripped):
        print(f"{len(names_stripped) - len(names_stripped_unique)} duplicates. {len(names_stripped_unique)} uniques.")
    print()

    # Accept a single scope string as well as any sequence of scopes.
    scopes = [map_from] if isinstance(map_from, str) else list(map_from)

    mg = mygene.MyGeneInfo()
    result = mg.querymany(
        names_stripped_unique.tolist(),
        scopes=scopes,
        fields=[map_to],
        species=species,
        as_dataframe=True,
        returnall=True,
    )
    # Look the pieces up by key rather than relying on dict ordering.
    missing = result['missing']
    hits = result['out'].reset_index().rename(columns={'query': 'input'})

    # When no query matched at all, the frame has neither a score column nor
    # the requested field; add them as all-NaN so the steps below still work.
    for column in ('_score', map_to):
        if column not in hits:
            hits[column] = float('nan')

    # Keep only the best-scoring hit per input name.
    hits = hits.sort_values(['input', '_score'], ascending=[True, False])
    hits = hits.drop_duplicates(subset='input', keep='first')

    # Rows without a value for `map_to` (not found, or found but lacking the
    # field) are neither "unchanged" nor "updates".
    found = hits[hits[map_to].notna()]
    is_same = found['input'] == found[map_to]
    same = found[is_same]
    updates = found[~is_same].set_index('input')[map_to]
    print(f"\nunchanged: {len(same)}; updates: {len(updates)}; missing: {len(missing)}")

    # Translate each input name through `updates`; names without an update
    # fall back to their stripped original.
    names_updated = names_stripped.map(updates)
    names_updated = names_updated.fillna(names_stripped).values
    return updates, missing, names_updated
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment