Skip to content

Instantly share code, notes, and snippets.

@bewt85
Created March 7, 2019 18:04
Show Gist options
  • Save bewt85/a749a30c7d46ed7c2995eafc094bc10f to your computer and use it in GitHub Desktop.
Some code from a project downloading data from ENA
import hashlib
import json
import logging
import os
import re
import subprocess
import sys
import time
import urllib
from collections import OrderedDict
from itertools import islice

import pandas as pd
import requests
logger = logging.getLogger(__name__)
def get_batches(df, count):
    """Yield consecutive row-slices of *df*, each with at most *count* rows.

    The final slice may be shorter; an empty frame yields nothing.
    """
    total_rows = df.shape[0]
    start = 0
    while start < total_rows:
        yield df.iloc[start:start + count]
        start += count
def get_chunks(g, size):
    """Yield lists of up to *size* items drawn from iterable *g*.

    The last chunk may be shorter; an exhausted/empty iterable yields
    nothing.
    """
    # Local import: `islice` is not imported at the top of this file, so
    # the original raised NameError on first use. Importing here keeps the
    # fix self-contained.
    from itertools import islice

    it = iter(g)  # also accept plain iterables, not only iterators
    chunk = list(islice(it, size))
    while chunk:
        yield chunk
        chunk = list(islice(it, size))
# Maps ENA Portal API field names -> human-readable display column names.
# Order matters: rows yielded by get_raw_metdata() follow this key order.
# Commented-out entries are available API fields deliberately excluded.
fields = OrderedDict([
('study_accession', 'Study ID'), # study accession number
# ('secondary_study_accession', 'secondary_study_accession'), # secondary study accession number
('sample_accession', 'Sample ID'), # sample accession number
# ('secondary_sample_accession', 'secondary_sample_accession'), # secondary sample accession number
('experiment_accession', 'Experiment ID'), # experiment accession number
('run_accession', 'Run ID'), # run accession number
# ('submission_accession', 'submission_accession'), # submission accession number
('tax_id', 'Submitted taxid'), # taxonomic ID
('scientific_name', 'Submitted scientific name'), # scientific name
# ('instrument_platform', 'instrument_platform'), # instrument platform used in sequencing experiment
# ('instrument_model', 'instrument_model'), # instrument model used in sequencing experiment
# ('library_name', 'library_name'), # sequencing library name
('center_name', 'Submitting center'), # Submitting center
('first_public', 'Reads made public'), # date when made public
('last_updated', 'Reads updated'), # date when last updated
('experiment_title', 'Experiment'), # brief experiment title
('study_title', 'Study'), # brief sequencing study description
# ('study_alias', 'study_alias'), # submitter's name for the study
# ('experiment_alias', 'experiment_alias'), # submitter's name for the experiment
# ('run_alias', 'run_alias'), # submitter's name for the run
# ('sample_alias', 'sample_alias'), # submitter's name for the sample
('sample_title', 'Sample'), # brief sample title
('first_created', 'Reads uploaded'), # date when first created
('sample_description', 'Sample description'), # detailed sample description
('strain', 'Submitted strain'), # strain from which sample was obtained
('serovar', 'Submitted serovar'), # serological variety of a species (usually a prokaryote) characterized by its antigenic properties
# ('sex', 'sex'), # sex of the organism from which the sample was obtained
# ('submitted_sex', 'submitted_sex'), # sex of the organism from which the sample was obtained
# ('dev_stage', 'dev_stage'), # sample obtained from an organism in a specific developmental stage
# ('tissue_type', 'tissue_type'), # tissue type from which the sample was obtained
('isolation_source', 'Isolation source'), # describes the physical, environmental and/or local geographical source of the sample
('isolate', 'Isolate'), # individual isolate from which sample was obtained
# ('host_tax_id', 'Host taxid'), # NCBI taxon id of the host
('host_scientific_name', 'Host'), # Scientific name of the natural (as opposed to laboratory) host to the organism from which sample was obtained
# ('host_common_name', 'host_common_name'), # common name of the host
('host_status', 'Host status'), # condition of host (eg. diseased or healthy)
# ('host_sex', 'host_sex'), # physical sex of the host
# ('submitted_host_sex', 'submitted_host_sex'), # physical sex of the host
('host_subject_id', 'Host ID'), # a unique identifier by which each subject can be referred to, de-identified
('collection_date', 'Collection date'), # date that the specimen was collected
('collected_by', 'Collected By'), # name of the person who collected the specimen
('collecting_institute', 'Collecting institute'), # Name of the institution to which the person collecting the specimen belongs. Format: Institute Name, Institute Address
('country', 'Country'), # locality of sample isolation: country names, oceans or seas, followed by regions and localities
('region', 'Region'), # geographical origin of the sample as defined by the specific region name followed by the locality name
('location', 'Location'), # geographic location of isolation of the sample
# ('environmental_sample', 'environmental_sample'), # identifies sequences derived by direct molecular isolation from an environmental DNA sample
# ('investigation_type', 'investigation_type'), # the study type targeted by the sequencing
('receipt_date', 'Sample receipt data'), # Date on which the sample was received
# ('sampling_site', 'sampling_site'), # the site/station where this sample was collection
])
def get_raw_metdata():
    """Yield one row per ENA run record from the ENA Portal API search
    endpoint, as a list of values ordered like ``fields``.

    Pages through results LIMIT at a time for each project query and
    stops paging when the API responds 204 (no content / past the end).
    NOTE(review): the name's 'metdata' typo is kept — callers use it.
    """
    s = requests.Session()
    LIMIT = 1000  # page size requested from the API
    offset = 0
    projects = [
        'study_accession="PRJNA248792"'
    ]
    payload = {
        'result': 'read_run',
        # NOTE(review): '%2C' is a pre-encoded comma, and the query below is
        # quote_plus()-encoded, yet requests form-encodes POST data again —
        # this may double-encode on the wire; confirm against the ENA API.
        'fields': '%2C'.join(fields.keys()),
        'limit': LIMIT,
        'dataPortal': 'pathogen',
        'format': 'json',
        'sortFields': 'run_accession'
    }
    for project in projects:
        offset = 0  # restart paging for each project
        payload['query'] = urllib.parse.quote_plus(project)
        while True:
            payload['offset'] = str(offset)
            r = s.post(
                'https://www.ebi.ac.uk/ena/portal/api/search',
                data=payload
            )
            if r.status_code == 204:
                # 204 No Content: no more results for this query
                break
            r.raise_for_status()
            logger.debug(f"{len(r.json())} new results found at offset {offset}")
            for el in r.json():
                # Missing fields default to '' so every row has equal width
                yield [el.get(f, '') for f in fields.keys()]
            offset += LIMIT
            time.sleep(1)  # throttle: one page request per second
def set_dates(df):
    """Collapse the ENA date columns into year/month/day integers plus the
    name of the column the date came from.

    Mutates *df* in place: adds 'year', 'month', 'day' and 'date_field'
    columns, then drops the original date columns. Rows with no parseable
    date get empty strings and date_field == 'missing'.
    """
    date_fields = [
        'collection_date',  # date that the specimen was collected
        'receipt_date',     # date on which the sample was received
        'first_created',    # date when first created
        'first_public',     # date when made public
        'last_updated',     # date when last updated
    ]

    def date(row):
        # First field (in the priority order above) that parses as
        # YYYY-MM-DD wins.
        for f in date_fields:
            try:
                if row[f]:
                    (year, month, day) = row[f].split('-')
                    return pd.Series([int(year), int(month), int(day), f])
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit. Catch only what the parse can raise:
            # KeyError (column absent), AttributeError (not a string),
            # ValueError (wrong part count or non-numeric parts).
            except (KeyError, AttributeError, ValueError):
                continue
        return pd.Series(['', '', '', 'missing'])

    df[['year', 'month', 'day', 'date_field']] = df.aggregate(date, axis=1)
    df.drop(columns=date_fields, inplace=True)
def collect_metadata():
    """Yield DataFrames of ENA run metadata, 500 records at a time.

    Each frame gets derived columns (displayname, filename, 'Assembled by')
    and its date columns collapsed via set_dates().
    """
    chunk_size = 500

    def _display_name(row):
        # Prefer the isolate name; fall back to the run accession.
        isolate = row.get('isolate')
        return isolate if isolate else row['run_accession']

    for batch in get_chunks(get_raw_metdata(), chunk_size):
        frame = pd.DataFrame(columns=fields.keys(), data=batch)
        frame['displayname'] = frame.aggregate(_display_name, axis=1)
        frame['filename'] = frame['run_accession'] + '.fa'
        frame['Assembled by'] = 'NCBI'
        set_dates(frame)
        yield frame
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment