Skip to content

Instantly share code, notes, and snippets.

@neuromusic
Last active March 5, 2019 03:59
Show Gist options
  • Save neuromusic/6ab7769c2030eec573b61b03a8021620 to your computer and use it in GitHub Desktop.
Save neuromusic/6ab7769c2030eec573b61b03a8021620 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import scanpy.api as sc"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"tm = sc.read('TM_facs_processed.h5ad')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['1500015L24Rik', '1500016L03Rik', '1600029D21Rik', '1700019G17Rik',\n",
" '1700092K14Rik', '1810024B03Rik', '1810030J14Rik', '1810065E05Rik',\n",
" '2010107G23Rik', '2210010C04Rik',\n",
" ...\n",
" 'Xpnpep3', 'Zdhhc3', 'Zeb2', 'Zfp160', 'Zfp597', 'Zfp609', 'Zfp964',\n",
" 'Zg16', 'Zhx1', 'Zrsr2'],\n",
" dtype='object', name='index', length=999)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tm.var_names"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import mygene\n",
"mg = mygene.MyGeneInfo()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'max_score': 467.7304,\n",
" 'took': 17,\n",
" 'total': 2,\n",
" 'hits': [{'_id': '68994',\n",
" '_score': 467.7304,\n",
" 'entrezgene': '68994',\n",
" 'name': 'RIKEN cDNA 1500015L24 gene',\n",
" 'symbol': '1500015L24Rik',\n",
" 'taxid': 10090},\n",
" {'_id': 'ENSMUSG00000094732',\n",
" '_score': 466.9848,\n",
" 'name': 'RIKEN cDNA 1500015L24 gene',\n",
" 'symbol': '1500015L24Rik',\n",
" 'taxid': 10090}]}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mg.query(tm.var_names[0])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"results = []\n",
"for gene_name in tm.var_names:\n",
" r = mg.query(gene_name)\n",
" \n",
" n_hits = r['total']\n",
" if n_hits > 0:\n",
" for hit in r['hits']:\n",
" try:\n",
" entrez_gene = hit['entrezgene']\n",
" name = hit['name']\n",
" break\n",
" except KeyError:\n",
" pass\n",
" \n",
" results.append(dict(\n",
" n_hits=n_hits,\n",
" entrez_gene=entrez_gene,\n",
" name=name,\n",
" gene_name=gene_name,\n",
" ))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"results = pd.DataFrame(results)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entrez_gene</th>\n",
" <th>gene_name</th>\n",
" <th>n_hits</th>\n",
" <th>name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>68994</td>\n",
" <td>1500015L24Rik</td>\n",
" <td>2</td>\n",
" <td>RIKEN cDNA 1500015L24 gene</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>78365</td>\n",
" <td>1500016L03Rik</td>\n",
" <td>1</td>\n",
" <td>LIM homeobox 1, opposite strand</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>76509</td>\n",
" <td>1600029D21Rik</td>\n",
" <td>1</td>\n",
" <td>placenta expressed transcript 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>75541</td>\n",
" <td>1700019G17Rik</td>\n",
" <td>1</td>\n",
" <td>N-acetyltransferase 8 (GCN5-related) family me...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>73536</td>\n",
" <td>1700092K14Rik</td>\n",
" <td>2</td>\n",
" <td>RIKEN cDNA 1700092K14 gene</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" entrez_gene gene_name n_hits \\\n",
"0 68994 1500015L24Rik 2 \n",
"1 78365 1500016L03Rik 1 \n",
"2 76509 1600029D21Rik 1 \n",
"3 75541 1700019G17Rik 1 \n",
"4 73536 1700092K14Rik 2 \n",
"\n",
" name \n",
"0 RIKEN cDNA 1500015L24 gene \n",
"1 LIM homeobox 1, opposite strand \n",
"2 placenta expressed transcript 1 \n",
"3 N-acetyltransferase 8 (GCN5-related) family me... \n",
"4 RIKEN cDNA 1700092K14 gene "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_hits</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>999.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>246.327327</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>964.286274</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>194.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>271.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>18649.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" n_hits\n",
"count 999.000000\n",
"mean 246.327327\n",
"std 964.286274\n",
"min 0.000000\n",
"25% 6.000000\n",
"50% 194.000000\n",
"75% 271.000000\n",
"max 18649.000000"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results.describe()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.92992992992993"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(results['n_hits'] > 0).mean()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"results.to_csv('mygene_query_results.csv')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"querying 1-999...done.\n"
]
}
],
"source": [
"summaries = mg.getgenes(results['entrez_gene'],fields='summary')"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"results['summary'] = [s.get('summary','') for s in summaries]"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6066066066066066"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(results['summary'].map(len)>0).mean()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"with_summaries = results['summary'][results['summary'].map(len)>0]"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"This gene encodes a member of the Ly6/PLAUR family of cysteine-rich proteins that plays an important role in the protection of colonic epithelium from flagellated microbiota. The encoded protein undergoes proteolytic processing to generate a mature, glycosylphosphatidylinositol-anchored protein that is localized to the apical surface of the colonic epithelial cells. Mice lacking the encoded protein are sensitive to chemically induced intestinal inflammation. [provided by RefSeq, Aug 2016].\n",
"\n",
"The protein encoded by this gene is a plasma glycoprotein of unknown function. The protein shows sequence similarity to the variable regions of some immunoglobulin supergene family member proteins. [provided by RefSeq, Jul 2008].\n",
"\n",
"This gene encodes a protein that is highly similar to mouse and rat kynurenine aminotransferase II. The rat protein is a homodimer with two transaminase activities. One activity is the transamination of alpha-aminoadipic acid, a final step in the saccaropine pathway which is the major pathway for L-lysine catabolism. The other activity involves the transamination of kynurenine to produce kynurenine acid, the precursor of kynurenic acid which has neuroprotective properties. Several transcript variants encoding two different isoforms have been found for this gene. [provided by RefSeq, Nov 2013].\n",
"\n",
"The membrane-associated protein encoded by this gene is a member of the superfamily of ATP-binding cassette (ABC) transporters. ABC proteins transport various molecules across extra- and intracellular membranes. ABC genes are divided into seven distinct subfamilies (ABC1, MDR/TAP, MRP, ALD, OABP, GCN20, and White). This encoded protein is a member of the ABC1 subfamily. Members of the ABC1 subfamily comprise the only major ABC subfamily found exclusively in multicellular eukaryotes. This gene is clustered among 4 other ABC1 family members on 17q24, but neither the substrate nor the function of this gene is known. Alternative splicing of this gene results in several transcript variants; however, not all variants have been fully described. [provided by RefSeq, Jul 2008].\n",
"\n",
"The protein encoded by this gene is a member of the superfamily of ATP-binding cassette (ABC) transporters. ABC proteins transport various molecules across extra- and intra-cellular membranes. ABC genes are divided into seven distinct subfamilies (ABC1, MDR/TAP, MRP, ALD, OABP, GCN20, White). This protein is a member of the MRP subfamily which is involved in multi-drug resistance. This protein is thought to form ATP-sensitive potassium channels in cardiac, skeletal, and vascular and non-vascular smooth muscle. Protein structure suggests a role as the drug-binding channel-modulating subunit of the extra-pancreatic ATP-sensitive potassium channels. Mutations in this gene are associated with cardiomyopathy dilated type 1O. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Apr 2011].\n",
"\n",
"The protein encoded by this gene possesses long-chain acyl-CoA synthetase activity. It is thought to play a central role in brain very long-chain fatty acids metabolism and myelinogenesis. [provided by RefSeq, Jul 2008].\n",
"\n",
"The product encoded by this gene belongs to the actin family of proteins, which are highly conserved proteins that play a role in cell motility, structure and integrity. Alpha, beta and gamma actin isoforms have been identified, with alpha actins being a major constituent of the contractile apparatus, while beta and gamma actins are involved in the regulation of cell motility. This actin is an alpha actin that is found in skeletal muscle. Mutations in this gene cause nemaline myopathy type 3, congenital myopathy with excess of thin myofilaments, congenital myopathy with cores, and congenital myopathy with fiber-type disproportion, diseases that lead to muscle fiber defects. [provided by RefSeq, Jul 2008].\n",
"\n",
"This gene encodes one of six different actin proteins. Actins are highly conserved proteins that are involved in cell motility, structure, integrity, and intercellular signaling. The encoded protein is a smooth muscle actin that is involved in vascular contractility and blood pressure homeostasis. Mutations in this gene cause a variety of vascular diseases, such as thoracic aortic disease, coronary artery disease, stroke, and Moyamoya disease, as well as multisystemic smooth muscle dysfunction syndrome. [provided by RefSeq, Sep 2017].\n",
"\n",
"Actins are highly conserved proteins that are involved in various types of cell motility. Polymerization of globular actin (G-actin) leads to a structural filament (F-actin) in the form of a two-stranded helix. Each actin can bind to four others. The protein encoded by this gene belongs to the actin family which is comprised of three main groups of actin isoforms, alpha, beta, and gamma. The alpha actins are found in muscle tissues and are a major constituent of the contractile apparatus. Defects in this gene have been associated with idiopathic dilated cardiomyopathy (IDC) and familial hypertrophic cardiomyopathy (FHC). [provided by RefSeq, Jul 2008].\n",
"\n",
"Alpha actinins belong to the spectrin gene superfamily which represents a diverse group of cytoskeletal proteins, including the alpha and beta spectrins and dystrophins. Alpha actinin is an actin-binding protein with multiple roles in different cell types. In nonmuscle cells, the cytoskeletal isoform is found along microfilament bundles and adherens-type junctions, where it is involved in binding actin to the membrane. In contrast, skeletal, cardiac, and smooth muscle isoforms are localized to the Z-disc and analogous dense bodies, where they help anchor the myofibrillar actin filaments. This gene encodes a muscle-specific, alpha actinin isoform that is expressed in both skeletal and cardiac muscles. Several transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, May 2013].\n",
"\n"
]
}
],
"source": [
"for s in with_summaries[:10]:\n",
" print(s)\n",
" print('')"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"488"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with_summaries.map(len).idxmax()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Hyaluronan or hyaluronic acid (HA) is a high molecular weight unbranched polysaccharide synthesized by a wide variety of organisms from bacteria to mammals, and is a constituent of the extracellular matrix. It consists of alternating glucuronic acid and N-acetylglucosamine residues that are linked by beta-1-3 and beta-1-4 glycosidic bonds. HA is synthesized by membrane-bound synthase at the inner surface of the plasma membrane, and the chains are extruded through pore-like structures into the extracellular space. It serves a variety of functions, including space filling, lubrication of joints, and provision of a matrix through which cells can migrate. HA is actively produced during wound healing and tissue repair to provide a framework for ingrowth of blood vessels and fibroblasts. Changes in the serum concentration of HA are associated with inflammatory and degenerative arthropathies such as rheumatoid arthritis. In addition, the interaction of HA with the leukocyte receptor CD44 is important in tissue-specific homing by leukocytes, and overexpression of HA receptors has been correlated with tumor metastasis. HAS1 is a member of the newly identified vertebrate gene family encoding putative hyaluronan synthases, and its amino acid sequence shows significant homology to the hasA gene product of Streptococcus pyogenes, a glycosaminoglycan synthetase (DG42) from Xenopus laevis, and a recently described murine hyaluronan synthase. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Jul 2014].'"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with_summaries[488]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "mygene-dev",
"language": "python",
"name": "mygene-dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment