Skip to content

Instantly share code, notes, and snippets.

@alexlenail
Created July 8, 2019 16:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexlenail/0a8d5ef061fe49337b10b2c3f9b717a6 to your computer and use it in GitHub Desktop.
Save alexlenail/0a8d5ef061fe49337b10b2c3f9b717a6 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from matplotlib_venn import venn2, venn3\n",
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"gene_id\n",
"ENSG00000000003.14 ENST00000373020.8\n",
"ENSG00000000005.5 ENST00000373031.4\n",
"ENSG00000000419.12 ENST00000371588.9\n",
"ENSG00000000457.13 ENST00000367772.8\n",
"ENSG00000000460.16 ENST00000359326.8\n",
"Name: transcript_id, dtype: object"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# UCSC knownCanonical (headerless, tab-separated) as a gene_id -> transcript_id Series.\n",
"# Column names are supplied here because the file has no header row; 'idk' is an\n",
"# unidentified column (possibly UCSC's clusterId -- TODO confirm) and is not used.\n",
"# NOTE(review): the path is absolute and local to the original author's machine.\n",
"ucsc_canonical = (\n",
"    pd.read_csv(\n",
"        '/Users/alex/Desktop/knownCanonical.txt',\n",
"        sep='\\t',\n",
"        header=None,\n",
"        names=['chr', 'start', 'stop', 'idk', 'transcript_id', 'gene_id'],\n",
"    )\n",
"    .set_index('gene_id')['transcript_id']\n",
")\n",
"ucsc_canonical.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sanity check: True when no gene_id appears more than once in the UCSC Series.\n",
"not ucsc_canonical.index.has_duplicates"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"gene_id\n",
"ENSG00000121410.11 ENST00000263100.7\n",
"ENSG00000268895.6 ENST00000594950.5\n",
"ENSG00000148584.15 ENST00000373997.8\n",
"ENSG00000175899.14 ENST00000318602.11\n",
"ENSG00000245105.4 ENST00000499762.2\n",
"Name: transcript_id, dtype: object"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows of the processed genome table with canonicity == 0, as a\n",
"# gene_id -> transcript_id Series.\n",
"# NOTE(review): presumably canonicity 0 marks the canonical transcript -- confirm\n",
"# against whatever produced ./processed/genome.csv.\n",
"my_canonical = (\n",
"    pd.read_csv('./processed/genome.csv')\n",
"    .loc[lambda genome: genome['canonicity'] == 0]\n",
"    .set_index('gene_id')['transcript_id']\n",
")\n",
"my_canonical.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sanity check: True when no gene_id appears more than once in my_canonical.\n",
"not my_canonical.index.has_duplicates"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How many gene_ids are shared?"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib_venn._common.VennDiagram at 0x1365d40b8>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 720x360 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Venn diagram of the gene_ids found in the index of each Series.\n",
"pylab.rcParams['figure.figsize'] = (10, 5)\n",
"gene_id_sets = (set(ucsc_canonical.index), set(my_canonical.index))\n",
"venn2(gene_id_sets, set_labels=('ucsc (v29)', 'gencode (v31)'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Restricting to the shared gene_ids, how many transcripts are canonical in both sources?"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Restrict both Series to the gene_ids that occur in both indexes.\n",
"shared_gene_ids = set(ucsc_canonical.index) & set(my_canonical.index)\n",
"\n",
"# .loc needs a list-like indexer: passing a set was deprecated in pandas 1.5 and\n",
"# raises TypeError from pandas 2.0. Sorting also makes the row order reproducible,\n",
"# since set iteration order can differ between interpreter runs.\n",
"shared_gene_id_list = sorted(shared_gene_ids)\n",
"\n",
"# NOTE: this rebinds ucsc_canonical / my_canonical to the subsetted Series and\n",
"# renames them; the later concat cell reads those names as column labels.\n",
"ucsc_canonical = ucsc_canonical.loc[shared_gene_id_list].rename('ucsc_canonical')\n",
"my_canonical = my_canonical.loc[shared_gene_id_list].rename('my_canonical')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib_venn._common.VennDiagram at 0x137a32518>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 720x360 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Venn diagram of the transcript_ids held as values in each Series.\n",
"pylab.rcParams['figure.figsize'] = (10, 5)\n",
"transcript_id_sets = (set(ucsc_canonical.values), set(my_canonical.values))\n",
"venn2(transcript_id_sets, set_labels=('ucsc (v29)', 'gencode (v31)'))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>my_canonical</th>\n",
" <th>ucsc_canonical</th>\n",
" </tr>\n",
" <tr>\n",
" <th>gene_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>ENSG00000147381.11</th>\n",
" <td>ENST00000360243.6</td>\n",
" <td>ENST00000276344.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000133316.15</th>\n",
" <td>ENST00000278856.8</td>\n",
" <td>ENST00000525239.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000108479.11</th>\n",
" <td>ENST00000588479.5</td>\n",
" <td>ENST00000225614.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000232490.6</th>\n",
" <td>ENST00000455961.1</td>\n",
" <td>ENST00000428872.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000100242.15</th>\n",
" <td>ENST00000405510.5</td>\n",
" <td>ENST00000406622.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000204136.10</th>\n",
" <td>ENST00000481799.5</td>\n",
" <td>ENST00000495328.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000164638.10</th>\n",
" <td>ENST00000297195.8</td>\n",
" <td>ENST00000396872.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000254369.6</th>\n",
" <td>ENST00000518947.6</td>\n",
" <td>ENST00000524304.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000251018.2</th>\n",
" <td>ENST00000514724.2</td>\n",
" <td>ENST00000521666.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000148985.19</th>\n",
" <td>ENST00000278243.8</td>\n",
" <td>ENST00000464906.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000214944.9</th>\n",
" <td>ENST00000437974.5</td>\n",
" <td>ENST00000545377.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000186715.11</th>\n",
" <td>ENST00000455405.6</td>\n",
" <td>ENST00000545160.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000221994.10</th>\n",
" <td>ENST00000409324.7</td>\n",
" <td>ENST00000442455.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000270433.1</th>\n",
" <td>ENST00000604278.1</td>\n",
" <td>ENST00000604456.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000182634.8</th>\n",
" <td>ENST00000330487.6</td>\n",
" <td>ENST00000641585.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000214182.5</th>\n",
" <td>ENST00000393073.4</td>\n",
" <td>ENST00000607242.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000184887.13</th>\n",
" <td>ENST00000392554.7</td>\n",
" <td>ENST00000536364.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000177200.17</th>\n",
" <td>ENST00000564845.5</td>\n",
" <td>ENST00000566029.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000182670.13</th>\n",
" <td>ENST00000354749.6</td>\n",
" <td>ENST00000355666.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000120693.13</th>\n",
" <td>ENST00000350148.9</td>\n",
" <td>ENST00000379826.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000176826.15</th>\n",
" <td>ENST00000455909.5</td>\n",
" <td>ENST00000441699.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000156531.16</th>\n",
" <td>ENST00000332070.7</td>\n",
" <td>ENST00000370803.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000261760.8</th>\n",
" <td>ENST00000561715.1</td>\n",
" <td>ENST00000636339.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000261188.1</th>\n",
" <td>ENST00000565764.1</td>\n",
" <td>ENST00000566814.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000087884.14</th>\n",
" <td>ENST00000393427.6</td>\n",
" <td>ENST00000526415.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000250379.1</th>\n",
" <td>ENST00000512295.1</td>\n",
" <td>ENST00000510176.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000265758.1</th>\n",
" <td>ENST00000585185.1</td>\n",
" <td>ENST00000584898.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000205882.8</th>\n",
" <td>ENST00000382205.4</td>\n",
" <td>ENST00000526438.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000251637.6</th>\n",
" <td>ENST00000511677.1</td>\n",
" <td>ENST00000529069.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000256274.1</th>\n",
" <td>ENST00000422992.2</td>\n",
" <td>ENST00000534866.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000168661.14</th>\n",
" <td>ENST00000303586.11</td>\n",
" <td>ENST00000439785.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000237438.7</th>\n",
" <td>ENST00000441006.5</td>\n",
" <td>ENST00000609932.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000227888.4</th>\n",
" <td>ENST00000525829.1</td>\n",
" <td>ENST00000602658.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000090097.21</th>\n",
" <td>ENST00000322099.11</td>\n",
" <td>ENST00000355852.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000163870.15</th>\n",
" <td>ENST00000355552.7</td>\n",
" <td>ENST00000648957.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000144713.12</th>\n",
" <td>ENST00000396953.6</td>\n",
" <td>ENST00000429711.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000237027.1</th>\n",
" <td>ENST00000425588.1</td>\n",
" <td>ENST00000438267.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000274333.4</th>\n",
" <td>ENST00000624444.1</td>\n",
" <td>ENST00000624627.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000188558.6</th>\n",
" <td>ENST00000343414.6</td>\n",
" <td>ENST00000641804.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000183935.5</th>\n",
" <td>ENST00000538670.1</td>\n",
" <td>ENST00000624664.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000240654.6</th>\n",
" <td>ENST00000332018.4</td>\n",
" <td>ENST00000382071.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000173209.23</th>\n",
" <td>ENST00000357022.6</td>\n",
" <td>ENST00000394457.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000185662.9</th>\n",
" <td>ENST00000523047.3</td>\n",
" <td>ENST00000330910.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000221888.4</th>\n",
" <td>ENST00000408896.4</td>\n",
" <td>ENST00000641256.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000248115.1</th>\n",
" <td>ENST00000510351.1</td>\n",
" <td>ENST00000508813.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000047579.19</th>\n",
" <td>ENST00000344537.9</td>\n",
" <td>ENST00000622898.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000204648.11</th>\n",
" <td>ENST00000608568.1</td>\n",
" <td>ENST00000376909.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000283554.1</th>\n",
" <td>ENST00000637043.1</td>\n",
" <td>ENST00000637462.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000240356.6</th>\n",
" <td>ENST00000391616.3</td>\n",
" <td>ENST00000416673.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000258940.2</th>\n",
" <td>ENST00000553520.1</td>\n",
" <td>ENST00000605298.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000262468.6</th>\n",
" <td>ENST00000573042.2</td>\n",
" <td>ENST00000649264.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000158716.8</th>\n",
" <td>ENST00000368107.1</td>\n",
" <td>ENST00000368109.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000266952.2</th>\n",
" <td>ENST00000588074.1</td>\n",
" <td>ENST00000649058.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000147274.14</th>\n",
" <td>ENST00000320676.11</td>\n",
" <td>ENST00000431446.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000114779.19</th>\n",
" <td>ENST00000361143.9</td>\n",
" <td>ENST00000483233.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000255319.5</th>\n",
" <td>ENST00000529093.1</td>\n",
" <td>ENST00000527856.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000155749.12</th>\n",
" <td>ENST00000286190.9</td>\n",
" <td>ENST00000405148.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000147789.15</th>\n",
" <td>ENST00000528372.5</td>\n",
" <td>ENST00000525266.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000173715.16</th>\n",
" <td>ENST00000360962.9</td>\n",
" <td>ENST00000525908.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ENSG00000184698.5</th>\n",
" <td>ENST00000328611.5</td>\n",
" <td>ENST00000642046.1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2468 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" my_canonical ucsc_canonical\n",
"gene_id \n",
"ENSG00000147381.11 ENST00000360243.6 ENST00000276344.6\n",
"ENSG00000133316.15 ENST00000278856.8 ENST00000525239.5\n",
"ENSG00000108479.11 ENST00000588479.5 ENST00000225614.6\n",
"ENSG00000232490.6 ENST00000455961.1 ENST00000428872.6\n",
"ENSG00000100242.15 ENST00000405510.5 ENST00000406622.5\n",
"ENSG00000204136.10 ENST00000481799.5 ENST00000495328.5\n",
"ENSG00000164638.10 ENST00000297195.8 ENST00000396872.7\n",
"ENSG00000254369.6 ENST00000518947.6 ENST00000524304.1\n",
"ENSG00000251018.2 ENST00000514724.2 ENST00000521666.1\n",
"ENSG00000148985.19 ENST00000278243.8 ENST00000464906.6\n",
"ENSG00000214944.9 ENST00000437974.5 ENST00000545377.5\n",
"ENSG00000186715.11 ENST00000455405.6 ENST00000545160.1\n",
"ENSG00000221994.10 ENST00000409324.7 ENST00000442455.7\n",
"ENSG00000270433.1 ENST00000604278.1 ENST00000604456.1\n",
"ENSG00000182634.8 ENST00000330487.6 ENST00000641585.1\n",
"ENSG00000214182.5 ENST00000393073.4 ENST00000607242.1\n",
"ENSG00000184887.13 ENST00000392554.7 ENST00000536364.5\n",
"ENSG00000177200.17 ENST00000564845.5 ENST00000566029.5\n",
"ENSG00000182670.13 ENST00000354749.6 ENST00000355666.5\n",
"ENSG00000120693.13 ENST00000350148.9 ENST00000379826.4\n",
"ENSG00000176826.15 ENST00000455909.5 ENST00000441699.1\n",
"ENSG00000156531.16 ENST00000332070.7 ENST00000370803.7\n",
"ENSG00000261760.8 ENST00000561715.1 ENST00000636339.2\n",
"ENSG00000261188.1 ENST00000565764.1 ENST00000566814.1\n",
"ENSG00000087884.14 ENST00000393427.6 ENST00000526415.5\n",
"ENSG00000250379.1 ENST00000512295.1 ENST00000510176.1\n",
"ENSG00000265758.1 ENST00000585185.1 ENST00000584898.1\n",
"ENSG00000205882.8 ENST00000382205.4 ENST00000526438.5\n",
"ENSG00000251637.6 ENST00000511677.1 ENST00000529069.5\n",
"ENSG00000256274.1 ENST00000422992.2 ENST00000534866.1\n",
"... ... ...\n",
"ENSG00000168661.14 ENST00000303586.11 ENST00000439785.5\n",
"ENSG00000237438.7 ENST00000441006.5 ENST00000609932.5\n",
"ENSG00000227888.4 ENST00000525829.1 ENST00000602658.1\n",
"ENSG00000090097.21 ENST00000322099.11 ENST00000355852.6\n",
"ENSG00000163870.15 ENST00000355552.7 ENST00000648957.1\n",
"ENSG00000144713.12 ENST00000396953.6 ENST00000429711.6\n",
"ENSG00000237027.1 ENST00000425588.1 ENST00000438267.1\n",
"ENSG00000274333.4 ENST00000624444.1 ENST00000624627.3\n",
"ENSG00000188558.6 ENST00000343414.6 ENST00000641804.1\n",
"ENSG00000183935.5 ENST00000538670.1 ENST00000624664.1\n",
"ENSG00000240654.6 ENST00000332018.4 ENST00000382071.6\n",
"ENSG00000173209.23 ENST00000357022.6 ENST00000394457.7\n",
"ENSG00000185662.9 ENST00000523047.3 ENST00000330910.7\n",
"ENSG00000221888.4 ENST00000408896.4 ENST00000641256.1\n",
"ENSG00000248115.1 ENST00000510351.1 ENST00000508813.1\n",
"ENSG00000047579.19 ENST00000344537.9 ENST00000622898.4\n",
"ENSG00000204648.11 ENST00000608568.1 ENST00000376909.6\n",
"ENSG00000283554.1 ENST00000637043.1 ENST00000637462.1\n",
"ENSG00000240356.6 ENST00000391616.3 ENST00000416673.6\n",
"ENSG00000258940.2 ENST00000553520.1 ENST00000605298.1\n",
"ENSG00000262468.6 ENST00000573042.2 ENST00000649264.1\n",
"ENSG00000158716.8 ENST00000368107.1 ENST00000368109.5\n",
"ENSG00000266952.2 ENST00000588074.1 ENST00000649058.1\n",
"ENSG00000147274.14 ENST00000320676.11 ENST00000431446.7\n",
"ENSG00000114779.19 ENST00000361143.9 ENST00000483233.5\n",
"ENSG00000255319.5 ENST00000529093.1 ENST00000527856.5\n",
"ENSG00000155749.12 ENST00000286190.9 ENST00000405148.6\n",
"ENSG00000147789.15 ENST00000528372.5 ENST00000525266.5\n",
"ENSG00000173715.16 ENST00000360962.9 ENST00000525908.6\n",
"ENSG00000184698.5 ENST00000328611.5 ENST00000642046.1\n",
"\n",
"[2468 rows x 2 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"canonicals = pd.concat((my_canonical, ucsc_canonical), axis=1)\n",
"canonicals[canonicals.my_canonical != canonicals.ucsc_canonical]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment