Skip to content

Instantly share code, notes, and snippets.

@bewt85
Created March 7, 2019 18:04
Show Gist options
  • Save bewt85/a749a30c7d46ed7c2995eafc094bc10f to your computer and use it in GitHub Desktop.
Some code from a project downloading data from ENA
import hashlib
import json
import logging
import os
import re
import subprocess
import sys
import time
import urllib
from collections import OrderedDict
from itertools import islice

import pandas as pd
import requests
logger = logging.getLogger(__name__)
def get_batches(df, count):
    """Yield consecutive row-slices of *df*, each with at most *count* rows.

    The final slice may be shorter; an empty frame yields nothing.
    """
    total_rows = df.shape[0]
    start = 0
    while start < total_rows:
        yield df.iloc[start:start + count]
        start += count
def get_chunks(g, size):
    """Yield lists of up to *size* items drawn from iterable *g*.

    The last chunk may be shorter; an exhausted/empty iterable yields
    nothing.
    """
    # Local import: `islice` is not imported at the top of this file, so
    # the original raised NameError on first use. Importing here keeps the
    # fix self-contained.
    from itertools import islice

    it = iter(g)  # also accept plain iterables, not only iterators
    chunk = list(islice(it, size))
    while chunk:
        yield chunk
        chunk = list(islice(it, size))
# Maps ENA Portal API field names -> human-readable display column names.
# Order matters: rows yielded by get_raw_metdata() follow this key order.
# Commented-out entries are available API fields deliberately excluded.
fields = OrderedDict([
('study_accession', 'Study ID'), # study accession number
# ('secondary_study_accession', 'secondary_study_accession'), # secondary study accession number
('sample_accession', 'Sample ID'), # sample accession number
# ('secondary_sample_accession', 'secondary_sample_accession'), # secondary sample accession number
('experiment_accession', 'Experiment ID'), # experiment accession number
('run_accession', 'Run ID'), # run accession number
# ('submission_accession', 'submission_accession'), # submission accession number
('tax_id', 'Submitted taxid'), # taxonomic ID
('scientific_name', 'Submitted scientific name'), # scientific name
# ('instrument_platform', 'instrument_platform'), # instrument platform used in sequencing experiment
# ('instrument_model', 'instrument_model'), # instrument model used in sequencing experiment
# ('library_name', 'library_name'), # sequencing library name
('center_name', 'Submitting center'), # Submitting center
('first_public', 'Reads made public'), # date when made public
('last_updated', 'Reads updated'), # date when last updated
('experiment_title', 'Experiment'), # brief experiment title
('study_title', 'Study'), # brief sequencing study description
# ('study_alias', 'study_alias'), # submitter's name for the study
# ('experiment_alias', 'experiment_alias'), # submitter's name for the experiment
# ('run_alias', 'run_alias'), # submitter's name for the run
# ('sample_alias', 'sample_alias'), # submitter's name for the sample
('sample_title', 'Sample'), # brief sample title
('first_created', 'Reads uploaded'), # date when first created
('sample_description', 'Sample description'), # detailed sample description
('strain', 'Submitted strain'), # strain from which sample was obtained
('serovar', 'Submitted serovar'), # serological variety of a species (usually a prokaryote) characterized by its antigenic properties
# ('sex', 'sex'), # sex of the organism from which the sample was obtained
# ('submitted_sex', 'submitted_sex'), # sex of the organism from which the sample was obtained
# ('dev_stage', 'dev_stage'), # sample obtained from an organism in a specific developmental stage
# ('tissue_type', 'tissue_type'), # tissue type from which the sample was obtained
('isolation_source', 'Isolation source'), # describes the physical, environmental and/or local geographical source of the sample
('isolate', 'Isolate'), # individual isolate from which sample was obtained
# ('host_tax_id', 'Host taxid'), # NCBI taxon id of the host
('host_scientific_name', 'Host'), # Scientific name of the natural (as opposed to laboratory) host to the organism from which sample was obtained
# ('host_common_name', 'host_common_name'), # common name of the host
('host_status', 'Host status'), # condition of host (eg. diseased or healthy)
# ('host_sex', 'host_sex'), # physical sex of the host
# ('submitted_host_sex', 'submitted_host_sex'), # physical sex of the host
('host_subject_id', 'Host ID'), # a unique identifier by which each subject can be referred to, de-identified
('collection_date', 'Collection date'), # date that the specimen was collected
('collected_by', 'Collected By'), # name of the person who collected the specimen
('collecting_institute', 'Collecting institute'), # Name of the institution to which the person collecting the specimen belongs. Format: Institute Name, Institute Address
('country', 'Country'), # locality of sample isolation: country names, oceans or seas, followed by regions and localities
('region', 'Region'), # geographical origin of the sample as defined by the specific region name followed by the locality name
('location', 'Location'), # geographic location of isolation of the sample
# ('environmental_sample', 'environmental_sample'), # identifies sequences derived by direct molecular isolation from an environmental DNA sample
# ('investigation_type', 'investigation_type'), # the study type targeted by the sequencing
('receipt_date', 'Sample receipt data'), # Date on which the sample was received
# ('sampling_site', 'sampling_site'), # the site/station where this sample was collection
])
def get_raw_metdata():
    """Yield one row per ENA run record from the ENA Portal API search
    endpoint, as a list of values ordered like ``fields``.

    Pages through results LIMIT at a time for each project query and
    stops paging when the API responds 204 (no content / past the end).
    NOTE(review): the name's 'metdata' typo is kept — callers use it.
    """
    s = requests.Session()
    LIMIT = 1000  # page size requested from the API
    offset = 0
    projects = [
        'study_accession="PRJNA248792"'
    ]
    payload = {
        'result': 'read_run',
        # NOTE(review): '%2C' is a pre-encoded comma, and the query below is
        # quote_plus()-encoded, yet requests form-encodes POST data again —
        # this may double-encode on the wire; confirm against the ENA API.
        'fields': '%2C'.join(fields.keys()),
        'limit': LIMIT,
        'dataPortal': 'pathogen',
        'format': 'json',
        'sortFields': 'run_accession'
    }
    for project in projects:
        offset = 0  # restart paging for each project
        payload['query'] = urllib.parse.quote_plus(project)
        while True:
            payload['offset'] = str(offset)
            r = s.post(
                'https://www.ebi.ac.uk/ena/portal/api/search',
                data=payload
            )
            if r.status_code == 204:
                # 204 No Content: no more results for this query
                break
            r.raise_for_status()
            logger.debug(f"{len(r.json())} new results found at offset {offset}")
            for el in r.json():
                # Missing fields default to '' so every row has equal width
                yield [el.get(f, '') for f in fields.keys()]
            offset += LIMIT
            time.sleep(1)  # throttle: one page request per second
def set_dates(df):
    """Collapse the ENA date columns into year/month/day integers plus the
    name of the column the date came from.

    Mutates *df* in place: adds 'year', 'month', 'day' and 'date_field'
    columns, then drops the original date columns. Rows with no parseable
    date get empty strings and date_field == 'missing'.
    """
    date_fields = [
        'collection_date',  # date that the specimen was collected
        'receipt_date',     # date on which the sample was received
        'first_created',    # date when first created
        'first_public',     # date when made public
        'last_updated',     # date when last updated
    ]

    def date(row):
        # First field (in the priority order above) that parses as
        # YYYY-MM-DD wins.
        for f in date_fields:
            try:
                if row[f]:
                    (year, month, day) = row[f].split('-')
                    return pd.Series([int(year), int(month), int(day), f])
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit. Catch only what the parse can raise:
            # KeyError (column absent), AttributeError (not a string),
            # ValueError (wrong part count or non-numeric parts).
            except (KeyError, AttributeError, ValueError):
                continue
        return pd.Series(['', '', '', 'missing'])

    df[['year', 'month', 'day', 'date_field']] = df.aggregate(date, axis=1)
    df.drop(columns=date_fields, inplace=True)
def collect_metadata():
    """Yield DataFrames of ENA run metadata, 500 records at a time.

    Each frame gets derived columns (displayname, filename, 'Assembled by')
    and its date columns collapsed via set_dates().
    """
    chunk_size = 500

    def _display_name(row):
        # Prefer the isolate name; fall back to the run accession.
        isolate = row.get('isolate')
        return isolate if isolate else row['run_accession']

    for batch in get_chunks(get_raw_metdata(), chunk_size):
        frame = pd.DataFrame(columns=fields.keys(), data=batch)
        frame['displayname'] = frame.aggregate(_display_name, axis=1)
        frame['filename'] = frame['run_accession'] + '.fa'
        frame['Assembled by'] = 'NCBI'
        set_dates(frame)
        yield frame
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment