Skip to content

Instantly share code, notes, and snippets.

@victorlin
Created April 5, 2020 04:04
Show Gist options
  • Save victorlin/a9704d74f03bcbf35cc3e03e68c4b0b5 to your computer and use it in GitHub Desktop.
Save victorlin/a9704d74f03bcbf35cc3e03e68c4b0b5 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from Bio import Entrez\n",
"Entrez.email = \"<>@gmail.com\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### search for all SRA samples"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2749 experiments\n"
]
}
],
"source": [
"# Coronaviridae - https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=11118\n",
"sra_search_term = \"txid11118[Organism:exp]\"\n",
"with Entrez.esearch(db=\"sra\", term=sra_search_term, retmax=3000) as handle_betacov:\n",
" record = Entrez.read(handle_betacov)\n",
" num_experiments = int(record['Count'])\n",
" uid_list = record['IdList']\n",
" print(f'{num_experiments} experiments')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fetching record 1 to 100\n",
"Fetching record 101 to 200\n",
"Fetching record 201 to 300\n",
"Fetching record 301 to 400\n",
"Fetching record 401 to 500\n",
"Fetching record 501 to 600\n",
"Fetching record 601 to 700\n",
"Fetching record 701 to 800\n",
"Fetching record 801 to 900\n",
"Fetching record 901 to 1000\n",
"Fetching record 1001 to 1100\n",
"Fetching record 1101 to 1200\n",
"Fetching record 1201 to 1300\n",
"Fetching record 1301 to 1400\n",
"Fetching record 1401 to 1500\n",
"Fetching record 1501 to 1600\n",
"Fetching record 1601 to 1700\n",
"Fetching record 1701 to 1800\n",
"Fetching record 1801 to 1900\n",
"Fetching record 1901 to 2000\n",
"Fetching record 2001 to 2100\n",
"Fetching record 2101 to 2200\n",
"Fetching record 2201 to 2300\n",
"Fetching record 2301 to 2400\n",
"Fetching record 2401 to 2500\n",
"Fetching record 2501 to 2600\n",
"Fetching record 2601 to 2700\n",
"Fetching record 2701 to 2749\n"
]
}
],
"source": [
"batch_size = 100\n",
"experiments = []\n",
"for start in range(0, num_experiments, batch_size):\n",
" end = min(num_experiments, start + batch_size)\n",
" print(f\"Fetching record {start + 1} to {end}\")\n",
" uids = uid_list[start:end]\n",
" with Entrez.esummary(db=\"sra\", id=','.join(uids)) as handle_esummary:\n",
" data = Entrez.read(handle_esummary)\n",
" experiments.extend(data)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def get_experiment_id(exp_xml):\n",
" idx_exp_tag = exp_xml.find('<Experiment')\n",
" start = idx_exp_tag + 17\n",
" end = idx_exp_tag + 27\n",
" return exp_xml[start:end]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"exp_ids = [get_experiment_id(exp['ExpXml']) for exp in experiments]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pysradb import SRAweb\n",
"db = SRAweb()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>study_accession</th>\n",
" <th>experiment_accession</th>\n",
" <th>experiment_title</th>\n",
" <th>experiment_desc</th>\n",
" <th>organism_taxid</th>\n",
" <th>organism_name</th>\n",
" <th>library_strategy</th>\n",
" <th>library_source</th>\n",
" <th>library_selection</th>\n",
" <th>sample_accession</th>\n",
" <th>sample_title</th>\n",
" <th>instrument</th>\n",
" <th>total_spots</th>\n",
" <th>total_size</th>\n",
" <th>run_accession</th>\n",
" <th>run_total_spots</th>\n",
" <th>run_total_bases</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ERP001119</td>\n",
" <td>ERX207938</td>\n",
" <td></td>\n",
" <td>Metagenomics of Betacoronavirus</td>\n",
" <td>694002</td>\n",
" <td>Betacoronavirus</td>\n",
" <td>AMPLICON</td>\n",
" <td>GENOMIC</td>\n",
" <td>PCR</td>\n",
" <td>ERS184553</td>\n",
" <td></td>\n",
" <td>Illumina MiSeq</td>\n",
" <td>6555345</td>\n",
" <td>1248079055</td>\n",
" <td>ERR233433</td>\n",
" <td>6555345</td>\n",
" <td>1966603500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>SRP254080</td>\n",
" <td>SRX8004780</td>\n",
" <td>RNA-Seq of recombinant SARS-CoV-2: nucleotides...</td>\n",
" <td>RNA-Seq of recombinant SARS-CoV-2: nucleotides...</td>\n",
" <td>2697049</td>\n",
" <td>Severe acute respiratory syndrome coronavirus 2</td>\n",
" <td>RNA-Seq</td>\n",
" <td>TRANSCRIPTOMIC</td>\n",
" <td>RANDOM</td>\n",
" <td>SRS6378608</td>\n",
" <td></td>\n",
" <td>Illumina NovaSeq 6000</td>\n",
" <td>63592467</td>\n",
" <td>6219772044</td>\n",
" <td>SRR11426414</td>\n",
" <td>63592467</td>\n",
" <td>18753570814</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>SRP254080</td>\n",
" <td>SRX8004779</td>\n",
" <td>RNA-Seq of recombinant SARS-CoV-2: SARS-CoV-mo...</td>\n",
" <td>RNA-Seq of recombinant SARS-CoV-2: SARS-CoV-mo...</td>\n",
" <td>2697049</td>\n",
" <td>Severe acute respiratory syndrome coronavirus 2</td>\n",
" <td>RNA-Seq</td>\n",
" <td>TRANSCRIPTOMIC</td>\n",
" <td>RANDOM</td>\n",
" <td>SRS6378607</td>\n",
" <td></td>\n",
" <td>Illumina NovaSeq 6000</td>\n",
" <td>65892888</td>\n",
" <td>6384922546</td>\n",
" <td>SRR11426415</td>\n",
" <td>65892888</td>\n",
" <td>19482216314</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>SRP254080</td>\n",
" <td>SRX8004778</td>\n",
" <td>RNA-Seq of recombinant SARS-CoV-2: GFP in ORF7a</td>\n",
" <td>RNA-Seq of recombinant SARS-CoV-2: GFP in ORF7a</td>\n",
" <td>2697049</td>\n",
" <td>Severe acute respiratory syndrome coronavirus 2</td>\n",
" <td>RNA-Seq</td>\n",
" <td>TRANSCRIPTOMIC</td>\n",
" <td>RANDOM</td>\n",
" <td>SRS6378606</td>\n",
" <td></td>\n",
" <td>Illumina NovaSeq 6000</td>\n",
" <td>68805012</td>\n",
" <td>6711704333</td>\n",
" <td>SRR11426416</td>\n",
" <td>68805012</td>\n",
" <td>20332189030</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>SRP254080</td>\n",
" <td>SRX8004777</td>\n",
" <td>RNA-Seq of recombinant SARS-CoV-2: nucleotides...</td>\n",
" <td>RNA-Seq of recombinant SARS-CoV-2: nucleotides...</td>\n",
" <td>2697049</td>\n",
" <td>Severe acute respiratory syndrome coronavirus 2</td>\n",
" <td>RNA-Seq</td>\n",
" <td>TRANSCRIPTOMIC</td>\n",
" <td>RANDOM</td>\n",
" <td>SRS6378605</td>\n",
" <td></td>\n",
" <td>Illumina NovaSeq 6000</td>\n",
" <td>72726462</td>\n",
" <td>7001420132</td>\n",
" <td>SRR11426417</td>\n",
" <td>72726462</td>\n",
" <td>21378281198</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" study_accession experiment_accession \\\n",
"0 ERP001119 ERX207938 \n",
"1 SRP254080 SRX8004780 \n",
"2 SRP254080 SRX8004779 \n",
"3 SRP254080 SRX8004778 \n",
"4 SRP254080 SRX8004777 \n",
"\n",
" experiment_title \\\n",
"0 \n",
"1 RNA-Seq of recombinant SARS-CoV-2: nucleotides... \n",
"2 RNA-Seq of recombinant SARS-CoV-2: SARS-CoV-mo... \n",
"3 RNA-Seq of recombinant SARS-CoV-2: GFP in ORF7a \n",
"4 RNA-Seq of recombinant SARS-CoV-2: nucleotides... \n",
"\n",
" experiment_desc organism_taxid \\\n",
"0 Metagenomics of Betacoronavirus 694002 \n",
"1 RNA-Seq of recombinant SARS-CoV-2: nucleotides... 2697049 \n",
"2 RNA-Seq of recombinant SARS-CoV-2: SARS-CoV-mo... 2697049 \n",
"3 RNA-Seq of recombinant SARS-CoV-2: GFP in ORF7a 2697049 \n",
"4 RNA-Seq of recombinant SARS-CoV-2: nucleotides... 2697049 \n",
"\n",
" organism_name library_strategy \\\n",
"0 Betacoronavirus AMPLICON \n",
"1 Severe acute respiratory syndrome coronavirus 2 RNA-Seq \n",
"2 Severe acute respiratory syndrome coronavirus 2 RNA-Seq \n",
"3 Severe acute respiratory syndrome coronavirus 2 RNA-Seq \n",
"4 Severe acute respiratory syndrome coronavirus 2 RNA-Seq \n",
"\n",
" library_source library_selection sample_accession sample_title \\\n",
"0 GENOMIC PCR ERS184553 \n",
"1 TRANSCRIPTOMIC RANDOM SRS6378608 \n",
"2 TRANSCRIPTOMIC RANDOM SRS6378607 \n",
"3 TRANSCRIPTOMIC RANDOM SRS6378606 \n",
"4 TRANSCRIPTOMIC RANDOM SRS6378605 \n",
"\n",
" instrument total_spots total_size run_accession \\\n",
"0 Illumina MiSeq 6555345 1248079055 ERR233433 \n",
"1 Illumina NovaSeq 6000 63592467 6219772044 SRR11426414 \n",
"2 Illumina NovaSeq 6000 65892888 6384922546 SRR11426415 \n",
"3 Illumina NovaSeq 6000 68805012 6711704333 SRR11426416 \n",
"4 Illumina NovaSeq 6000 72726462 7001420132 SRR11426417 \n",
"\n",
" run_total_spots run_total_bases \n",
"0 6555345 1966603500 \n",
"1 63592467 18753570814 \n",
"2 65892888 19482216314 \n",
"3 68805012 20332189030 \n",
"4 72726462 21378281198 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = db.sra_metadata(exp_ids)\n",
"df.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment