Skip to content

Instantly share code, notes, and snippets.

@simecek
Created September 20, 2021 22:02
Show Gist options
  • Save simecek/8ba4163547c929c53a509d975b85936e to your computer and use it in GitHub Desktop.
Save simecek/8ba4163547c929c53a509d975b85936e to your computer and use it in GitHub Desktop.
Parsing FASTA with BioPython
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import gzip\n",
"import re\n",
"from tqdm.notebook import tqdm\n",
"\n",
"from Bio import SeqIO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get list of intervals\n",
"\n",
"CSV tables with intervals can be found at https://github.com/ML-Bioinfo-CEITEC/genomic_benchmarks/tree/main/datasets/demo_coding_vs_intergenomic_seqs/train"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>region</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>strand</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>27434</td>\n",
" <td>chr2</td>\n",
" <td>56478034</td>\n",
" <td>56478234</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13400</td>\n",
" <td>chr21</td>\n",
" <td>20427531</td>\n",
" <td>20427731</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>883</td>\n",
" <td>chr17</td>\n",
" <td>73975959</td>\n",
" <td>73976159</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7303</td>\n",
" <td>chr13</td>\n",
" <td>45730273</td>\n",
" <td>45730473</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45124</td>\n",
" <td>chr10</td>\n",
" <td>38820750</td>\n",
" <td>38820950</td>\n",
" <td>+</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id region start end strand\n",
"0 27434 chr2 56478034 56478234 +\n",
"1 13400 chr21 20427531 20427731 +\n",
"2 883 chr17 73975959 73976159 +\n",
"3 7303 chr13 45730273 45730473 +\n",
"4 45124 chr10 38820750 38820950 +"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"../../datasets/demo_coding_vs_intergenomic_seqs/train/intergenomic_seqs.csv\")\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>region</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>strand</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>27434</td>\n",
" <td>ENST00000217185</td>\n",
" <td>203</td>\n",
" <td>403</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13400</td>\n",
" <td>ENST00000367051</td>\n",
" <td>4701</td>\n",
" <td>4901</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>883</td>\n",
" <td>ENST00000360507</td>\n",
" <td>623</td>\n",
" <td>823</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7303</td>\n",
" <td>ENST00000489432</td>\n",
" <td>1469</td>\n",
" <td>1669</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45124</td>\n",
" <td>ENST00000309668</td>\n",
" <td>64</td>\n",
" <td>264</td>\n",
" <td>+</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id region start end strand\n",
"0 27434 ENST00000217185 203 403 +\n",
"1 13400 ENST00000367051 4701 4901 +\n",
"2 883 ENST00000360507 623 823 +\n",
"3 7303 ENST00000489432 1469 1669 +\n",
"4 45124 ENST00000309668 64 264 +"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = pd.read_csv(\"../../datasets/demo_coding_vs_intergenomic_seqs/train/coding_seqs.csv\")\n",
"df2.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get genome and transcriptome\n",
"\n",
"We are using release 97 of Ensembl (Human): http://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"GENOME_PATH = \"../../refs/Homo_sapiens.GRCh38.dna.toplevel.fa.gz\"\n",
"TRANSCRIPTOME_PATH = \"../../refs/Homo_sapiens.GRCh38.cdna.all.fa.gz\"\n",
"\n",
"def fasta2dict(fasta_past, fasta_total=None, stop_id=None, region_name_transform=lambda x: x):\n",
" fasta = dict()\n",
" \n",
" with gzip.open(fasta_past, \"rt\") as handle:\n",
" for record in tqdm(SeqIO.parse(handle, \"fasta\"), total=fasta_total):\n",
" fasta[region_name_transform(record.id)] = str(record.seq)\n",
" \n",
" if stop_id and (record.id == stop_id): \n",
" # stop, do not read small contigs\n",
" break\n",
" \n",
" return fasta"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9c7500bfb47145d29df2dc861d4e93f5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/190000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 6.44 s, sys: 137 ms, total: 6.58 s\n",
"Wall time: 6.54 s\n"
]
}
],
"source": [
"%%time\n",
"transcriptome = fasta2dict(TRANSCRIPTOME_PATH, 190_000, region_name_transform=lambda x: re.sub(\"ENST([0-9]*)[.][0-9]*\", \"ENST\\\\1\", x))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4b0dac41d80d4b60bdf2654209072298",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/24 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 35 s, sys: 3.48 s, total: 38.5 s\n",
"Wall time: 38.5 s\n"
]
}
],
"source": [
"%%time\n",
"genome = fasta2dict(GENOME_PATH, 24, \"MT\", region_name_transform=lambda x: \"chr\"+x)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# all interval regions found?\n",
"all([region in genome for region in df['region']])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all([t in transcriptome for t in df2['region']])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Find sequences"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def seq_finder(ref, region, start, end):\n",
" return ref[region][start:end]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'AGAAATTTGTCTATTTCATCTAAGTTATCAAATTTCTTGTTGTACAGTTTATAGTACTCCTTTTTAATCCTTTTTATTTCCATGAGATCAAAAATGACATCTTTCTTTCACTTCTGACTTTAGTAATATGCATTCTCTCTCTCTCTTATTAGTCAGTTTAGGTAAAGGTTTTTCAATTTTGCTGACCTTTTCAAAGATCT'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"seq_finder(genome, df['region'].iloc[0], df['start'].iloc[0], df['end'].iloc[0])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def seq_finder_tab(ref, tab):\n",
" return pd.Series([ref[region][start:end] for region, start, end in zip(tab['region'], tab['start'], tab['end'])])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 28.3 ms, sys: 4.03 ms, total: 32.3 ms\n",
"Wall time: 31.4 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>region</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>strand</th>\n",
" <th>seq</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>27434</td>\n",
" <td>chr2</td>\n",
" <td>56478034</td>\n",
" <td>56478234</td>\n",
" <td>+</td>\n",
" <td>AGAAATTTGTCTATTTCATCTAAGTTATCAAATTTCTTGTTGTACA...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13400</td>\n",
" <td>chr21</td>\n",
" <td>20427531</td>\n",
" <td>20427731</td>\n",
" <td>+</td>\n",
" <td>GAGAGAGGAAGCAAGAGAGAGACGCAAATGCTGCCAGAACTTTTTA...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>883</td>\n",
" <td>chr17</td>\n",
" <td>73975959</td>\n",
" <td>73976159</td>\n",
" <td>+</td>\n",
" <td>TCCTTCCCAAGAGGTCTTCTGTGAGCCCACCCAACGAAGTGATGGT...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7303</td>\n",
" <td>chr13</td>\n",
" <td>45730273</td>\n",
" <td>45730473</td>\n",
" <td>+</td>\n",
" <td>CTCAGGCTGCTTCTCCAAGTCAGTGCAGAAGTTAAGGGGAGCACCT...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45124</td>\n",
" <td>chr10</td>\n",
" <td>38820750</td>\n",
" <td>38820950</td>\n",
" <td>+</td>\n",
" <td>ATAGTATGGAATGGAATCGAATGGAATTGAATCGAATAGAATTGGC...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id region start end strand \\\n",
"0 27434 chr2 56478034 56478234 + \n",
"1 13400 chr21 20427531 20427731 + \n",
"2 883 chr17 73975959 73976159 + \n",
"3 7303 chr13 45730273 45730473 + \n",
"4 45124 chr10 38820750 38820950 + \n",
"\n",
" seq \n",
"0 AGAAATTTGTCTATTTCATCTAAGTTATCAAATTTCTTGTTGTACA... \n",
"1 GAGAGAGGAAGCAAGAGAGAGACGCAAATGCTGCCAGAACTTTTTA... \n",
"2 TCCTTCCCAAGAGGTCTTCTGTGAGCCCACCCAACGAAGTGATGGT... \n",
"3 CTCAGGCTGCTTCTCCAAGTCAGTGCAGAAGTTAAGGGGAGCACCT... \n",
"4 ATAGTATGGAATGGAATCGAATGGAATTGAATCGAATAGAATTGGC... "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"df['seq'] = seq_finder_tab(genome, df)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 46.6 ms, sys: 276 µs, total: 46.9 ms\n",
"Wall time: 45.8 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>region</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>strand</th>\n",
" <th>seq</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>27434</td>\n",
" <td>ENST00000217185</td>\n",
" <td>203</td>\n",
" <td>403</td>\n",
" <td>+</td>\n",
" <td>CTGGACGAGGCGGGTGGGGCCGTGGCCCAGGGCTATGTGCCCCACA...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13400</td>\n",
" <td>ENST00000367051</td>\n",
" <td>4701</td>\n",
" <td>4901</td>\n",
" <td>+</td>\n",
" <td>GCTTGTGGGAGAACGGTCAATATATTGCACCAGCAAAGATGATCAA...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>883</td>\n",
" <td>ENST00000360507</td>\n",
" <td>623</td>\n",
" <td>823</td>\n",
" <td>+</td>\n",
" <td>CTGACCCTCGGGAGATCTTTGAGGTGCTGAGCTGGTTGGAGAGCTG...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7303</td>\n",
" <td>ENST00000489432</td>\n",
" <td>1469</td>\n",
" <td>1669</td>\n",
" <td>+</td>\n",
" <td>CTCCCTCTACAGCTGCGACGACTGCGGCAGGAGCTTCCGGCTGGAG...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45124</td>\n",
" <td>ENST00000309668</td>\n",
" <td>64</td>\n",
" <td>264</td>\n",
" <td>+</td>\n",
" <td>CCCTGAGTCTGTATTGCTCAAGAAGGGCCTTCCCCAGCAATGACCT...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id region start end strand \\\n",
"0 27434 ENST00000217185 203 403 + \n",
"1 13400 ENST00000367051 4701 4901 + \n",
"2 883 ENST00000360507 623 823 + \n",
"3 7303 ENST00000489432 1469 1669 + \n",
"4 45124 ENST00000309668 64 264 + \n",
"\n",
" seq \n",
"0 CTGGACGAGGCGGGTGGGGCCGTGGCCCAGGGCTATGTGCCCCACA... \n",
"1 GCTTGTGGGAGAACGGTCAATATATTGCACCAGCAAAGATGATCAA... \n",
"2 CTGACCCTCGGGAGATCTTTGAGGTGCTGAGCTGGTTGGAGAGCTG... \n",
"3 CTCCCTCTACAGCTGCGACGACTGCGGCAGGAGCTTCCGGCTGGAG... \n",
"4 CCCTGAGTCTGTATTGCTCAAGAAGGGCCTTCCCCAGCAATGACCT... "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"df2['seq'] = seq_finder_tab(transcriptome, df2)\n",
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment