Skip to content

Instantly share code, notes, and snippets.

@PubuduSaneth
Created December 24, 2018 09:21
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PubuduSaneth/6275f5df25868e0a8132d7e40c8d317f to your computer and use it in GitHub Desktop.
Save PubuduSaneth/6275f5df25868e0a8132d7e40c8d317f to your computer and use it in GitHub Desktop.
Annotating gene coordinates and gene lists - python way
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pysam"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Section A: Format GENCODE gff3 file"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1. Read gff3 file into a pandas dataframe and examine the GENCODE file format"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>seqname</th>\n",
" <th>source</th>\n",
" <th>feature</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>score</th>\n",
" <th>strand</th>\n",
" <th>frame</th>\n",
" <th>attribute</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr1</td>\n",
" <td>HAVANA</td>\n",
" <td>gene</td>\n",
" <td>11869</td>\n",
" <td>14412</td>\n",
" <td>.</td>\n",
" <td>+</td>\n",
" <td>.</td>\n",
" <td>ID=ENSG00000223972.4;gene_id=ENSG00000223972.4...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr1</td>\n",
" <td>HAVANA</td>\n",
" <td>transcript</td>\n",
" <td>11869</td>\n",
" <td>14409</td>\n",
" <td>.</td>\n",
" <td>+</td>\n",
" <td>.</td>\n",
" <td>ID=ENST00000456328.2;Parent=ENSG00000223972.4;...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr1</td>\n",
" <td>HAVANA</td>\n",
" <td>exon</td>\n",
" <td>11869</td>\n",
" <td>12227</td>\n",
" <td>.</td>\n",
" <td>+</td>\n",
" <td>.</td>\n",
" <td>ID=exon:ENST00000456328.2:1;Parent=ENST0000045...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr1</td>\n",
" <td>HAVANA</td>\n",
" <td>exon</td>\n",
" <td>12613</td>\n",
" <td>12721</td>\n",
" <td>.</td>\n",
" <td>+</td>\n",
" <td>.</td>\n",
" <td>ID=exon:ENST00000456328.2:2;Parent=ENST0000045...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr1</td>\n",
" <td>HAVANA</td>\n",
" <td>exon</td>\n",
" <td>13221</td>\n",
" <td>14409</td>\n",
" <td>.</td>\n",
" <td>+</td>\n",
" <td>.</td>\n",
" <td>ID=exon:ENST00000456328.2:3;Parent=ENST0000045...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" seqname source feature start end score strand frame \\\n",
"0 chr1 HAVANA gene 11869 14412 . + . \n",
"1 chr1 HAVANA transcript 11869 14409 . + . \n",
"2 chr1 HAVANA exon 11869 12227 . + . \n",
"3 chr1 HAVANA exon 12613 12721 . + . \n",
"4 chr1 HAVANA exon 13221 14409 . + . \n",
"\n",
" attribute \n",
"0 ID=ENSG00000223972.4;gene_id=ENSG00000223972.4... \n",
"1 ID=ENST00000456328.2;Parent=ENSG00000223972.4;... \n",
"2 ID=exon:ENST00000456328.2:1;Parent=ENST0000045... \n",
"3 ID=exon:ENST00000456328.2:2;Parent=ENST0000045... \n",
"4 ID=exon:ENST00000456328.2:3;Parent=ENST0000045... "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gencode = pd.read_table(\"/Users/pubudu/Documents/RefData/Gencode/gencode.v19.annotation.gff3\", comment=\"#\",\n",
" sep = \"\\t\", names = ['seqname', 'source', 'feature', 'start' , 'end', 'score', 'strand', 'frame', 'attribute'])\n",
"gencode.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 2615566 entries, 0 to 2615565\n",
"Data columns (total 9 columns):\n",
"seqname object\n",
"source object\n",
"feature object\n",
"start int64\n",
"end int64\n",
"score object\n",
"strand object\n",
"frame object\n",
"attribute object\n",
"dtypes: int64(2), object(7)\n",
"memory usage: 179.6+ MB\n"
]
}
],
"source": [
"gencode.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2. Extract Genes in the gff3 file \"feature = gene\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 57820 entries, 0 to 57819\n",
"Data columns (total 4 columns):\n",
"seqname 57820 non-null object\n",
"start 57820 non-null int64\n",
"end 57820 non-null int64\n",
"attribute 57820 non-null object\n",
"dtypes: int64(2), object(2)\n",
"memory usage: 1.8+ MB\n"
]
}
],
"source": [
"gencode_genes = gencode[(gencode.feature == \"gene\")][['seqname', 'start', 'end', 'attribute']].copy().reset_index().drop('index', axis=1) # Extract genes\n",
"gencode_genes.info()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>seqname</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>attribute</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr1</td>\n",
" <td>11869</td>\n",
" <td>14412</td>\n",
" <td>ID=ENSG00000223972.4;gene_id=ENSG00000223972.4...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr1</td>\n",
" <td>14363</td>\n",
" <td>29806</td>\n",
" <td>ID=ENSG00000227232.4;gene_id=ENSG00000227232.4...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr1</td>\n",
" <td>29554</td>\n",
" <td>31109</td>\n",
" <td>ID=ENSG00000243485.2;gene_id=ENSG00000243485.2...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr1</td>\n",
" <td>34554</td>\n",
" <td>36081</td>\n",
" <td>ID=ENSG00000237613.2;gene_id=ENSG00000237613.2...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr1</td>\n",
" <td>52473</td>\n",
" <td>54936</td>\n",
" <td>ID=ENSG00000268020.2;gene_id=ENSG00000268020.2...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" seqname start end attribute\n",
"0 chr1 11869 14412 ID=ENSG00000223972.4;gene_id=ENSG00000223972.4...\n",
"1 chr1 14363 29806 ID=ENSG00000227232.4;gene_id=ENSG00000227232.4...\n",
"2 chr1 29554 31109 ID=ENSG00000243485.2;gene_id=ENSG00000243485.2...\n",
"3 chr1 34554 36081 ID=ENSG00000237613.2;gene_id=ENSG00000237613.2...\n",
"4 chr1 52473 54936 ID=ENSG00000268020.2;gene_id=ENSG00000268020.2..."
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gencode_genes.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3. Extract gene_name, gene_type, gene_status, level of each gene"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def gene_info(x):\n",
" # Extract gene names\n",
" g_name = list(filter(lambda x: 'gene_name' in x, x.split(\";\")))[0].split(\"=\")[1]\n",
" g_type = list(filter(lambda x: 'gene_type' in x, x.split(\";\")))[0].split(\"=\")[1]\n",
" g_status = list(filter(lambda x: 'gene_status' in x, x.split(\";\")))[0].split(\"=\")[1]\n",
" g_leve = int(list(filter(lambda x: 'level' in x, x.split(\";\")))[0].split(\"=\")[1])\n",
" return (g_name, g_type, g_status, g_leve)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>seqname</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>attribute</th>\n",
" <th>gene_name</th>\n",
" <th>gene_type</th>\n",
" <th>gene_status</th>\n",
" <th>gene_level</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr1</td>\n",
" <td>11869</td>\n",
" <td>14412</td>\n",
" <td>ID=ENSG00000223972.4;gene_id=ENSG00000223972.4...</td>\n",
" <td>DDX11L1</td>\n",
" <td>pseudogene</td>\n",
" <td>KNOWN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr1</td>\n",
" <td>14363</td>\n",
" <td>29806</td>\n",
" <td>ID=ENSG00000227232.4;gene_id=ENSG00000227232.4...</td>\n",
" <td>WASH7P</td>\n",
" <td>pseudogene</td>\n",
" <td>KNOWN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr1</td>\n",
" <td>29554</td>\n",
" <td>31109</td>\n",
" <td>ID=ENSG00000243485.2;gene_id=ENSG00000243485.2...</td>\n",
" <td>MIR1302-11</td>\n",
" <td>lincRNA</td>\n",
" <td>NOVEL</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr1</td>\n",
" <td>34554</td>\n",
" <td>36081</td>\n",
" <td>ID=ENSG00000237613.2;gene_id=ENSG00000237613.2...</td>\n",
" <td>FAM138A</td>\n",
" <td>lincRNA</td>\n",
" <td>KNOWN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr1</td>\n",
" <td>52473</td>\n",
" <td>54936</td>\n",
" <td>ID=ENSG00000268020.2;gene_id=ENSG00000268020.2...</td>\n",
" <td>OR4G4P</td>\n",
" <td>pseudogene</td>\n",
" <td>KNOWN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" seqname start end attribute \\\n",
"0 chr1 11869 14412 ID=ENSG00000223972.4;gene_id=ENSG00000223972.4... \n",
"1 chr1 14363 29806 ID=ENSG00000227232.4;gene_id=ENSG00000227232.4... \n",
"2 chr1 29554 31109 ID=ENSG00000243485.2;gene_id=ENSG00000243485.2... \n",
"3 chr1 34554 36081 ID=ENSG00000237613.2;gene_id=ENSG00000237613.2... \n",
"4 chr1 52473 54936 ID=ENSG00000268020.2;gene_id=ENSG00000268020.2... \n",
"\n",
" gene_name gene_type gene_status gene_level \n",
"0 DDX11L1 pseudogene KNOWN 2 \n",
"1 WASH7P pseudogene KNOWN 2 \n",
"2 MIR1302-11 lincRNA NOVEL 2 \n",
"3 FAM138A lincRNA KNOWN 2 \n",
"4 OR4G4P pseudogene KNOWN 2 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gencode_genes[\"gene_name\"], gencode_genes[\"gene_type\"], gencode_genes[\"gene_status\"], gencode_genes[\"gene_level\"] = zip(*gencode_genes.attribute.apply(lambda x: gene_info(x)))\n",
"gencode_genes.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 pseudogene\n",
"2 lincRNA\n",
"6 protein_coding\n",
"13 antisense\n",
"14 processed_transcript\n",
"15 snRNA\n",
"76 sense_intronic\n",
"82 miRNA\n",
"110 misc_RNA\n",
"254 snoRNA\n",
"315 rRNA\n",
"608 3prime_overlapping_ncrna\n",
"614 polymorphic_pseudogene\n",
"1274 sense_overlapping\n",
"6806 IG_V_gene\n",
"6849 IG_C_gene\n",
"6850 IG_J_gene\n",
"6857 IG_V_pseudogene\n",
"21480 TR_C_gene\n",
"21481 TR_J_gene\n",
"21487 TR_V_gene\n",
"21488 TR_V_pseudogene\n",
"26189 IG_C_pseudogene\n",
"38188 TR_D_gene\n",
"38198 TR_J_pseudogene\n",
"39997 IG_J_pseudogene\n",
"40004 IG_D_gene\n",
"57783 Mt_tRNA\n",
"57784 Mt_rRNA\n",
"Name: gene_type, dtype: object"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gencode_genes['gene_type'].drop_duplicates()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 4. Extract all known protein_coding genes"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 19456 entries, 0 to 19455\n",
"Data columns (total 8 columns):\n",
"seqname 19456 non-null object\n",
"start 19456 non-null int64\n",
"end 19456 non-null int64\n",
"attribute 19456 non-null object\n",
"gene_name 19456 non-null object\n",
"gene_type 19456 non-null object\n",
"gene_status 19456 non-null object\n",
"gene_level 19456 non-null int64\n",
"dtypes: int64(3), object(5)\n",
"memory usage: 1.2+ MB\n"
]
}
],
"source": [
"gencode_genes = gencode_genes[gencode_genes['gene_status'] == 'KNOWN'].reset_index().drop('index', axis=1)\n",
"gencode_genes = gencode_genes[gencode_genes['gene_type'] == 'protein_coding'].reset_index().drop('index', axis=1)\n",
"gencode_genes.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 5. Remove duplicates - Prioritize verified and manually annotated loci over automatically annotated loci\n",
"Gene level column can be used to prioritize genes when removing duplicates \n",
"1. verified loci\n",
"2. manually annotated loci\n",
"3. automatically annotated loci"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>seqname</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>attribute</th>\n",
" <th>gene_name</th>\n",
" <th>gene_type</th>\n",
" <th>gene_status</th>\n",
" <th>gene_level</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr1</td>\n",
" <td>1246965</td>\n",
" <td>1260071</td>\n",
" <td>ID=ENSG00000127054.14;gene_id=ENSG00000127054....</td>\n",
" <td>CPSF3L</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr1</td>\n",
" <td>1337288</td>\n",
" <td>1342693</td>\n",
" <td>ID=ENSG00000242485.1;gene_id=ENSG00000242485.1...</td>\n",
" <td>MRPL20</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr1</td>\n",
" <td>1353800</td>\n",
" <td>1357149</td>\n",
" <td>ID=ENSG00000235098.4;gene_id=ENSG00000235098.4...</td>\n",
" <td>ANKRD65</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr1</td>\n",
" <td>1550795</td>\n",
" <td>1565990</td>\n",
" <td>ID=ENSG00000197530.8;gene_id=ENSG00000197530.8...</td>\n",
" <td>MIB2</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr1</td>\n",
" <td>1716729</td>\n",
" <td>1822495</td>\n",
" <td>ID=ENSG00000078369.13;gene_id=ENSG00000078369....</td>\n",
" <td>GNB1</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" seqname start end \\\n",
"0 chr1 1246965 1260071 \n",
"1 chr1 1337288 1342693 \n",
"2 chr1 1353800 1357149 \n",
"3 chr1 1550795 1565990 \n",
"4 chr1 1716729 1822495 \n",
"\n",
" attribute gene_name \\\n",
"0 ID=ENSG00000127054.14;gene_id=ENSG00000127054.... CPSF3L \n",
"1 ID=ENSG00000242485.1;gene_id=ENSG00000242485.1... MRPL20 \n",
"2 ID=ENSG00000235098.4;gene_id=ENSG00000235098.4... ANKRD65 \n",
"3 ID=ENSG00000197530.8;gene_id=ENSG00000197530.8... MIB2 \n",
"4 ID=ENSG00000078369.13;gene_id=ENSG00000078369.... GNB1 \n",
"\n",
" gene_type gene_status gene_level \n",
"0 protein_coding KNOWN 1 \n",
"1 protein_coding KNOWN 1 \n",
"2 protein_coding KNOWN 1 \n",
"3 protein_coding KNOWN 1 \n",
"4 protein_coding KNOWN 1 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## Sort gene_leve in each chromosome (ascending oder) and remove duplicates\n",
"gencode_genes = gencode_genes.sort_values(['gene_level', 'seqname'], ascending=True).drop_duplicates('gene_name', keep='first').reset_index().drop('index', axis=1) \n",
"gencode_genes.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 19410 entries, 0 to 19409\n",
"Data columns (total 8 columns):\n",
"seqname 19410 non-null object\n",
"start 19410 non-null int64\n",
"end 19410 non-null int64\n",
"attribute 19410 non-null object\n",
"gene_name 19410 non-null object\n",
"gene_type 19410 non-null object\n",
"gene_status 19410 non-null object\n",
"gene_level 19410 non-null int64\n",
"dtypes: int64(3), object(5)\n",
"memory usage: 1.2+ MB\n"
]
}
],
"source": [
"gencode_genes.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* Step 5 removed 46 records (19455-19409)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"#### 6. Save the dataframe gencode_genes into a file and index it using Tabix"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"gencode_genes.to_csv('gencode.v19.annotation.gff3_all_known_genes.txt', index=False, header = False, sep=\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 12632\n",
"-rw-r--r-- 1 pubudu staff 65214 24 Dec 01:36 01.Format_gencode_gff3.ipynb\n",
"-rw-r--r-- 1 pubudu staff 59 20 Dec 20:07 README.md\n",
"-rw-r--r-- 1 pubudu staff 6069939 24 Dec 01:36 gencode.v19.annotation.gff3_all_known_genes.txt\n",
"-rw-r--r-- 1 pubudu staff 264198 24 Dec 01:36 gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed.gz\n",
"-rw-r--r-- 1 pubudu staff 61119 24 Dec 01:36 gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed.gz.tbi\n"
]
}
],
"source": [
"%%bash -s gencode.v19.annotation.gff3_all_known_genes.txt\n",
"cut -f 1,2,3,5 $1 | sortBed -i > gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed\n",
"bgzip gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed\n",
"tabix -p bed gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed.gz\n",
"ls -l"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Section B: Find genes that overlap with a set of genomic intervals"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1. A function to calculate overlapping basepairs"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def overlap(q_st, q_end, res_st, res_end):\n",
" o = min(q_end, res_end)-max(q_st, res_st)\n",
" return o"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"chr1\t127569438\t127669438\t25\t100000\t-\n",
"chr1\t194793911\t194893911\t16\t100000\t+\n",
"chr10\t77847895\t77947895\t44\t100000\t-\n",
"chr10\t101647085\t101747085\t30\t100000\t+\n",
"chr11\t50056632\t50156632\t10\t100000\t-\n",
"chr11\t120313124\t120413124\t37\t100000\t-\n",
"chr12\t130593409\t130693409\t4\t100000\t-\n",
"chr13\t15142062\t15242062\t50\t100000\t+\n",
"chr13\t22210089\t22310089\t14\t100000\t-\n",
"chr13\t99558514\t99658514\t45\t100000\t-\n"
]
}
],
"source": [
"### Create a random .bed file\n",
"!bedtools random -l 100000 -n 50 -g /Users/pubudu/Documents/RefData/hg19/human.hg19.genome | sortBed -i stdin > sample.bed\n",
"!head sample.bed"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>chr</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>name</th>\n",
" <th>score</th>\n",
" <th>strand</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr1</td>\n",
" <td>127569438</td>\n",
" <td>127669438</td>\n",
" <td>25</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr1</td>\n",
" <td>194793911</td>\n",
" <td>194893911</td>\n",
" <td>16</td>\n",
" <td>100000</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr10</td>\n",
" <td>77847895</td>\n",
" <td>77947895</td>\n",
" <td>44</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr10</td>\n",
" <td>101647085</td>\n",
" <td>101747085</td>\n",
" <td>30</td>\n",
" <td>100000</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr11</td>\n",
" <td>50056632</td>\n",
" <td>50156632</td>\n",
" <td>10</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" chr start end name score strand\n",
"0 chr1 127569438 127669438 25 100000 -\n",
"1 chr1 194793911 194893911 16 100000 +\n",
"2 chr10 77847895 77947895 44 100000 -\n",
"3 chr10 101647085 101747085 30 100000 +\n",
"4 chr11 50056632 50156632 10 100000 -"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"### Read the sample file in to a pandas dataframe\n",
"df = pd.read_table(\"sample.bed\", names=['chr', 'start', 'end', 'name', 'score', 'strand'])\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2. A function to find overlapping genes in tabix index file"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def gencode_all_known_genes(a, tb):\n",
" genes = []\n",
"\n",
" try:\n",
" for region in tb.fetch(a['chr'], int(a['start']), int(a['end'])):\n",
" if region:\n",
" r = region.split('\\t')\n",
" overlap_len = overlap(int(a['start']), int(a['end']), int(r[1]), int(r[2]))\n",
" ret_val = '{}({})'.format(r[3], np.round(overlap_len/float(int(a['end'])-int(a['start']))*100, 2)) ### Percentage of the input interval that overlap with the gene\n",
" genes.append(ret_val) \n",
"\n",
" if len(genes)>0:\n",
" return \";\".join(genes)\n",
" else:\n",
" return \"NA(0)\"\n",
" except ValueError:\n",
" return \"NA(0)\"\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import pysam\n",
"gencode_v19 = pysam.TabixFile('gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed.gz')\n",
"\n",
"df['genes'] = df.apply(lambda x: gencode_all_known_genes(x[['chr', 'start', 'end']], gencode_v19), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>chr</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>name</th>\n",
" <th>score</th>\n",
" <th>strand</th>\n",
" <th>genes</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr1</td>\n",
" <td>127569438</td>\n",
" <td>127669438</td>\n",
" <td>25</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>NA(0)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr1</td>\n",
" <td>194793911</td>\n",
" <td>194893911</td>\n",
" <td>16</td>\n",
" <td>100000</td>\n",
" <td>+</td>\n",
" <td>NA(0)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr10</td>\n",
" <td>77847895</td>\n",
" <td>77947895</td>\n",
" <td>44</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>C10orf11(100.0)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr10</td>\n",
" <td>101647085</td>\n",
" <td>101747085</td>\n",
" <td>30</td>\n",
" <td>100000</td>\n",
" <td>+</td>\n",
" <td>DNMBP(100.0)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr11</td>\n",
" <td>50056632</td>\n",
" <td>50156632</td>\n",
" <td>10</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>NA(0)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" chr start end name score strand genes\n",
"0 chr1 127569438 127669438 25 100000 - NA(0)\n",
"1 chr1 194793911 194893911 16 100000 + NA(0)\n",
"2 chr10 77847895 77947895 44 100000 - C10orf11(100.0)\n",
"3 chr10 101647085 101747085 30 100000 + DNMBP(100.0)\n",
"4 chr11 50056632 50156632 10 100000 - NA(0)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 50 entries, 0 to 49\n",
"Data columns (total 7 columns):\n",
"chr 50 non-null object\n",
"start 50 non-null int64\n",
"end 50 non-null int64\n",
"name 50 non-null int64\n",
"score 50 non-null int64\n",
"strand 50 non-null object\n",
"genes 50 non-null object\n",
"dtypes: int64(4), object(3)\n",
"memory usage: 2.8+ KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### When an input interval overlaps with multiple genes, \"genes\" column will list all those genes (separated by \";\")\n",
"* If you need to have a single row for each gene, use the following code to transform the dataframe - df"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 30 entries, 0 to 29\n",
"Data columns (total 7 columns):\n",
"chr 30 non-null object\n",
"start 30 non-null int64\n",
"end 30 non-null int64\n",
"name 30 non-null int64\n",
"score 30 non-null int64\n",
"strand 30 non-null object\n",
"genes 30 non-null object\n",
"dtypes: int64(4), object(3)\n",
"memory usage: 1.7+ KB\n"
]
}
],
"source": [
"## Remove all the intervals that do not overlap with genes\n",
"df = df[df['genes'] != \"NA(0)\"].reset_index(drop=True)\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"new_rows = []\n",
"for i,r in df.iterrows():\n",
" g_list = r['genes'].split(\";\")\n",
" for g in g_list:\n",
" g = g.replace(\" \",\"\")\n",
" new_rows.append(np.append(r[['chr', 'start', 'end', 'name', 'score', 'strand', 'genes']].values, g))\n",
" \n",
"df_perGene = pd.DataFrame()\n",
"df_perGene = df_perGene.append(pd.DataFrame(new_rows, columns=['chr', 'start', 'end', 'name', 'score', 'strand', 'genes', 'gene_ID'])).reset_index().drop('index', axis=1)\n",
"\n",
"df_perGene['gene_name'] = df_perGene['gene_ID'].apply(lambda x: x.split(\"(\")[0])\n",
"df_perGene['gene_coverage'] = df_perGene['gene_ID'].apply(lambda x: x.split(\"(\")[1].replace(\")\", \"\"))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>chr</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>name</th>\n",
" <th>score</th>\n",
" <th>strand</th>\n",
" <th>genes</th>\n",
" <th>gene_ID</th>\n",
" <th>gene_name</th>\n",
" <th>gene_coverage</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr10</td>\n",
" <td>77847895</td>\n",
" <td>77947895</td>\n",
" <td>44</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>C10orf11(100.0)</td>\n",
" <td>C10orf11(100.0)</td>\n",
" <td>C10orf11</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr10</td>\n",
" <td>101647085</td>\n",
" <td>101747085</td>\n",
" <td>30</td>\n",
" <td>100000</td>\n",
" <td>+</td>\n",
" <td>DNMBP(100.0)</td>\n",
" <td>DNMBP(100.0)</td>\n",
" <td>DNMBP</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr11</td>\n",
" <td>120313124</td>\n",
" <td>120413124</td>\n",
" <td>37</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>ARHGEF12(47.52);GRIK4(30.66);AP002348.1(3.22)</td>\n",
" <td>ARHGEF12(47.52)</td>\n",
" <td>ARHGEF12</td>\n",
" <td>47.52</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr11</td>\n",
" <td>120313124</td>\n",
" <td>120413124</td>\n",
" <td>37</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>ARHGEF12(47.52);GRIK4(30.66);AP002348.1(3.22)</td>\n",
" <td>GRIK4(30.66)</td>\n",
" <td>GRIK4</td>\n",
" <td>30.66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr11</td>\n",
" <td>120313124</td>\n",
" <td>120413124</td>\n",
" <td>37</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>ARHGEF12(47.52);GRIK4(30.66);AP002348.1(3.22)</td>\n",
" <td>AP002348.1(3.22)</td>\n",
" <td>AP002348.1</td>\n",
" <td>3.22</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" chr start end name score strand \\\n",
"0 chr10 77847895 77947895 44 100000 - \n",
"1 chr10 101647085 101747085 30 100000 + \n",
"2 chr11 120313124 120413124 37 100000 - \n",
"3 chr11 120313124 120413124 37 100000 - \n",
"4 chr11 120313124 120413124 37 100000 - \n",
"\n",
" genes gene_ID \\\n",
"0 C10orf11(100.0) C10orf11(100.0) \n",
"1 DNMBP(100.0) DNMBP(100.0) \n",
"2 ARHGEF12(47.52);GRIK4(30.66);AP002348.1(3.22) ARHGEF12(47.52) \n",
"3 ARHGEF12(47.52);GRIK4(30.66);AP002348.1(3.22) GRIK4(30.66) \n",
"4 ARHGEF12(47.52);GRIK4(30.66);AP002348.1(3.22) AP002348.1(3.22) \n",
"\n",
" gene_name gene_coverage \n",
"0 C10orf11 100.0 \n",
"1 DNMBP 100.0 \n",
"2 ARHGEF12 47.52 \n",
"3 GRIK4 30.66 \n",
"4 AP002348.1 3.22 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_perGene.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>chr</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>name</th>\n",
" <th>score</th>\n",
" <th>strand</th>\n",
" <th>gene_name</th>\n",
" <th>gene_coverage</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr10</td>\n",
" <td>77847895</td>\n",
" <td>77947895</td>\n",
" <td>44</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>C10orf11</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr10</td>\n",
" <td>101647085</td>\n",
" <td>101747085</td>\n",
" <td>30</td>\n",
" <td>100000</td>\n",
" <td>+</td>\n",
" <td>DNMBP</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr11</td>\n",
" <td>120313124</td>\n",
" <td>120413124</td>\n",
" <td>37</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>ARHGEF12</td>\n",
" <td>47.52</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr11</td>\n",
" <td>120313124</td>\n",
" <td>120413124</td>\n",
" <td>37</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>GRIK4</td>\n",
" <td>30.66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr11</td>\n",
" <td>120313124</td>\n",
" <td>120413124</td>\n",
" <td>37</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>AP002348.1</td>\n",
" <td>3.22</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" chr start end name score strand gene_name gene_coverage\n",
"0 chr10 77847895 77947895 44 100000 - C10orf11 100.0\n",
"1 chr10 101647085 101747085 30 100000 + DNMBP 100.0\n",
"2 chr11 120313124 120413124 37 100000 - ARHGEF12 47.52\n",
"3 chr11 120313124 120413124 37 100000 - GRIK4 30.66\n",
"4 chr11 120313124 120413124 37 100000 - AP002348.1 3.22"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## drop the genes column\n",
"df_perGene = df_perGene.drop([\"genes\", \"gene_ID\"], axis=1)\n",
"df_perGene.head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 41 entries, 0 to 40\n",
"Data columns (total 8 columns):\n",
"chr 41 non-null object\n",
"start 41 non-null int64\n",
"end 41 non-null int64\n",
"name 41 non-null int64\n",
"score 41 non-null int64\n",
"strand 41 non-null object\n",
"gene_name 41 non-null object\n",
"gene_coverage 41 non-null object\n",
"dtypes: int64(4), object(4)\n",
"memory usage: 2.6+ KB\n"
]
}
],
"source": [
"df_perGene.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Section C: Find coordinates of a gene list"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>seqname</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>attribute</th>\n",
" <th>gene_name</th>\n",
" <th>gene_type</th>\n",
" <th>gene_status</th>\n",
" <th>gene_level</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr1</td>\n",
" <td>1246965</td>\n",
" <td>1260071</td>\n",
" <td>ID=ENSG00000127054.14;gene_id=ENSG00000127054....</td>\n",
" <td>CPSF3L</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr1</td>\n",
" <td>1337288</td>\n",
" <td>1342693</td>\n",
" <td>ID=ENSG00000242485.1;gene_id=ENSG00000242485.1...</td>\n",
" <td>MRPL20</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr1</td>\n",
" <td>1353800</td>\n",
" <td>1357149</td>\n",
" <td>ID=ENSG00000235098.4;gene_id=ENSG00000235098.4...</td>\n",
" <td>ANKRD65</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr1</td>\n",
" <td>1550795</td>\n",
" <td>1565990</td>\n",
" <td>ID=ENSG00000197530.8;gene_id=ENSG00000197530.8...</td>\n",
" <td>MIB2</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr1</td>\n",
" <td>1716729</td>\n",
" <td>1822495</td>\n",
" <td>ID=ENSG00000078369.13;gene_id=ENSG00000078369....</td>\n",
" <td>GNB1</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" seqname start end \\\n",
"0 chr1 1246965 1260071 \n",
"1 chr1 1337288 1342693 \n",
"2 chr1 1353800 1357149 \n",
"3 chr1 1550795 1565990 \n",
"4 chr1 1716729 1822495 \n",
"\n",
" attribute gene_name \\\n",
"0 ID=ENSG00000127054.14;gene_id=ENSG00000127054.... CPSF3L \n",
"1 ID=ENSG00000242485.1;gene_id=ENSG00000242485.1... MRPL20 \n",
"2 ID=ENSG00000235098.4;gene_id=ENSG00000235098.4... ANKRD65 \n",
"3 ID=ENSG00000197530.8;gene_id=ENSG00000197530.8... MIB2 \n",
"4 ID=ENSG00000078369.13;gene_id=ENSG00000078369.... GNB1 \n",
"\n",
" gene_type gene_status gene_level \n",
"0 protein_coding KNOWN 1 \n",
"1 protein_coding KNOWN 1 \n",
"2 protein_coding KNOWN 1 \n",
"3 protein_coding KNOWN 1 \n",
"4 protein_coding KNOWN 1 "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gencode_genes.head()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>seqname</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>attribute</th>\n",
" <th>gene_type</th>\n",
" <th>gene_status</th>\n",
" <th>gene_level</th>\n",
" </tr>\n",
" <tr>\n",
" <th>gene_name</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>CPSF3L</th>\n",
" <td>chr1</td>\n",
" <td>1246965</td>\n",
" <td>1260071</td>\n",
" <td>ID=ENSG00000127054.14;gene_id=ENSG00000127054....</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MRPL20</th>\n",
" <td>chr1</td>\n",
" <td>1337288</td>\n",
" <td>1342693</td>\n",
" <td>ID=ENSG00000242485.1;gene_id=ENSG00000242485.1...</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ANKRD65</th>\n",
" <td>chr1</td>\n",
" <td>1353800</td>\n",
" <td>1357149</td>\n",
" <td>ID=ENSG00000235098.4;gene_id=ENSG00000235098.4...</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MIB2</th>\n",
" <td>chr1</td>\n",
" <td>1550795</td>\n",
" <td>1565990</td>\n",
" <td>ID=ENSG00000197530.8;gene_id=ENSG00000197530.8...</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>GNB1</th>\n",
" <td>chr1</td>\n",
" <td>1716729</td>\n",
" <td>1822495</td>\n",
" <td>ID=ENSG00000078369.13;gene_id=ENSG00000078369....</td>\n",
" <td>protein_coding</td>\n",
" <td>KNOWN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" seqname start end \\\n",
"gene_name \n",
"CPSF3L chr1 1246965 1260071 \n",
"MRPL20 chr1 1337288 1342693 \n",
"ANKRD65 chr1 1353800 1357149 \n",
"MIB2 chr1 1550795 1565990 \n",
"GNB1 chr1 1716729 1822495 \n",
"\n",
" attribute gene_type \\\n",
"gene_name \n",
"CPSF3L ID=ENSG00000127054.14;gene_id=ENSG00000127054.... protein_coding \n",
"MRPL20 ID=ENSG00000242485.1;gene_id=ENSG00000242485.1... protein_coding \n",
"ANKRD65 ID=ENSG00000235098.4;gene_id=ENSG00000235098.4... protein_coding \n",
"MIB2 ID=ENSG00000197530.8;gene_id=ENSG00000197530.8... protein_coding \n",
"GNB1 ID=ENSG00000078369.13;gene_id=ENSG00000078369.... protein_coding \n",
"\n",
" gene_status gene_level \n",
"gene_name \n",
"CPSF3L KNOWN 1 \n",
"MRPL20 KNOWN 1 \n",
"ANKRD65 KNOWN 1 \n",
"MIB2 KNOWN 1 \n",
"GNB1 KNOWN 1 "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gencode_genes = gencode_genes.set_index('gene_name')\n",
"gencode_genes.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"def fetch_gene_coords(g):\n",
"\n",
" if gencode_genes.index.contains(g): \n",
" return gencode_genes.loc[g]['seqname'], gencode_genes.loc[g]['start'], gencode_genes.loc[g]['end'] #gencode_genes.loc[g][['seqname', 'start', 'end']]\n",
" else:\n",
" return \"NA\", \"NA\", \"NA\""
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>chr</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>name</th>\n",
" <th>score</th>\n",
" <th>strand</th>\n",
" <th>gene_name</th>\n",
" <th>gene_coverage</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr10</td>\n",
" <td>77847895</td>\n",
" <td>77947895</td>\n",
" <td>44</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>C10orf11</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr10</td>\n",
" <td>101647085</td>\n",
" <td>101747085</td>\n",
" <td>30</td>\n",
" <td>100000</td>\n",
" <td>+</td>\n",
" <td>DNMBP</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr11</td>\n",
" <td>120313124</td>\n",
" <td>120413124</td>\n",
" <td>37</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>ARHGEF12</td>\n",
" <td>47.52</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr11</td>\n",
" <td>120313124</td>\n",
" <td>120413124</td>\n",
" <td>37</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>GRIK4</td>\n",
" <td>30.66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr11</td>\n",
" <td>120313124</td>\n",
" <td>120413124</td>\n",
" <td>37</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>AP002348.1</td>\n",
" <td>3.22</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" chr start end name score strand gene_name gene_coverage\n",
"0 chr10 77847895 77947895 44 100000 - C10orf11 100.0\n",
"1 chr10 101647085 101747085 30 100000 + DNMBP 100.0\n",
"2 chr11 120313124 120413124 37 100000 - ARHGEF12 47.52\n",
"3 chr11 120313124 120413124 37 100000 - GRIK4 30.66\n",
"4 chr11 120313124 120413124 37 100000 - AP002348.1 3.22"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_perGene.head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 24.6 ms, sys: 1.98 ms, total: 26.6 ms\n",
"Wall time: 25.1 ms\n"
]
}
],
"source": [
"time df_perGene['g_chr'], df_perGene['g_start'], df_perGene['g_end'] = zip(*df_perGene['gene_name'].apply(lambda x: fetch_gene_coords(x)))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>chr</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>name</th>\n",
" <th>score</th>\n",
" <th>strand</th>\n",
" <th>gene_name</th>\n",
" <th>gene_coverage</th>\n",
" <th>g_chr</th>\n",
" <th>g_start</th>\n",
" <th>g_end</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>chr10</td>\n",
" <td>77847895</td>\n",
" <td>77947895</td>\n",
" <td>44</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>C10orf11</td>\n",
" <td>100.0</td>\n",
" <td>chr10</td>\n",
" <td>77360998</td>\n",
" <td>78319925</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>chr10</td>\n",
" <td>101647085</td>\n",
" <td>101747085</td>\n",
" <td>30</td>\n",
" <td>100000</td>\n",
" <td>+</td>\n",
" <td>DNMBP</td>\n",
" <td>100.0</td>\n",
" <td>chr10</td>\n",
" <td>101635334</td>\n",
" <td>101769676</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>chr11</td>\n",
" <td>120313124</td>\n",
" <td>120413124</td>\n",
" <td>37</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>ARHGEF12</td>\n",
" <td>47.52</td>\n",
" <td>chr11</td>\n",
" <td>120207787</td>\n",
" <td>120360645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>chr11</td>\n",
" <td>120313124</td>\n",
" <td>120413124</td>\n",
" <td>37</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>GRIK4</td>\n",
" <td>30.66</td>\n",
" <td>chr11</td>\n",
" <td>120382468</td>\n",
" <td>120859613</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>chr11</td>\n",
" <td>120313124</td>\n",
" <td>120413124</td>\n",
" <td>37</td>\n",
" <td>100000</td>\n",
" <td>-</td>\n",
" <td>AP002348.1</td>\n",
" <td>3.22</td>\n",
" <td>chr11</td>\n",
" <td>120382511</td>\n",
" <td>120385732</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" chr start end name score strand gene_name gene_coverage \\\n",
"0 chr10 77847895 77947895 44 100000 - C10orf11 100.0 \n",
"1 chr10 101647085 101747085 30 100000 + DNMBP 100.0 \n",
"2 chr11 120313124 120413124 37 100000 - ARHGEF12 47.52 \n",
"3 chr11 120313124 120413124 37 100000 - GRIK4 30.66 \n",
"4 chr11 120313124 120413124 37 100000 - AP002348.1 3.22 \n",
"\n",
" g_chr g_start g_end \n",
"0 chr10 77360998 78319925 \n",
"1 chr10 101635334 101769676 \n",
"2 chr11 120207787 120360645 \n",
"3 chr11 120382468 120859613 \n",
"4 chr11 120382511 120385732 "
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_perGene.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (py36)",
"language": "python",
"name": "py36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment