gen_accession_list.py
##
# Hacked together, sorry.
# Required CSVs were tediously exported manually from the GEO web interface.
##
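#
# Inputs (paths relative to the working directory, inferred from the reads below):
#   config/supported_microarray_platforms.csv - platform accessions in column 2
#   config/supported_rnaseq_platforms.txt     - one platform name per line
#   accessions/<platform>.csv                 - SRA run tables, SRP accession in column 6
#   config/geo_gpl_map.json                   - {name: [GPL accessions]}
# Output:
#   all_accessions.json - {accession: number of samples/assays}
#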
import csv
import json
import requests
import pdb
import pprint
import GEOparse
import xmltodict
import logging
GEOparse.logger.setLevel(logging.WARNING)
AE_URL = "https://www.ebi.ac.uk/arrayexpress/ArrayExpress-Experiments.txt?array="
GEO_URL = "https://www.ncbi.nlm.nih.gov/gds?term=XXX[Related%20Series]&cmd=DetailsSearch&format=TEXT"
SRA_URL = "https://www.ncbi.nlm.nih.gov/sra?term=%22instrument%20illumina%20genome%20analyzer%22%5BProperties%5D"
MINIML_URL = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=XXX&targ=self&form=xml&view=quick"
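# What each endpoint returns (as scraped in June 2018; inferred from the
# parsing below, not from official API docs):
#   AE_URL     - tab-separated table of ArrayExpress experiments for an array design
#   GEO_URL    - Entrez GDS "DetailsSearch" results for a platform, in TEXT format
#   SRA_URL    - an SRA instrument search; defined but not used below
#   MINIML_URL - the MINiML XML record for a GEO series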
all_accessions = {}
# First do microarray
with open('config/supported_microarray_platforms.csv', 'r') as csv_file:
    csv_in = csv.reader(csv_file)
    for ccount, row in enumerate(csv_in):
        try:
            if ccount == 0:
                continue  # skip the header row
            accession = row[1]
            # Hyphenated accessions are ArrayExpress array designs.
            if '-' in accession:
                url = AE_URL + accession
                tsv_text = requests.get(url, timeout=60).text.splitlines()
                tsv_in = csv.reader(tsv_text, delimiter='\t')
                for count, tsv_row in enumerate(tsv_in):
                    if count == 0:
                        continue  # skip the header row
                    ex_accession = tsv_row[0]
                    num_assays = tsv_row[4]
                    all_accessions[ex_accession] = int(num_assays)
            # Everything else is a GEO platform (GPL) accession.
            else:
                url = GEO_URL.replace('XXX', accession)
                geo_text = requests.get(url, timeout=60).text
                geo_text = geo_text.split('<pre>\n')[1]
                sections = geo_text.split('\n\n')
                for section in sections:
                    num_samples = 0
                    ex_accession = None
                    for lcount, line in enumerate(section.split('\n')):
                        if lcount == 4:
                            num_samples = int(line.split(' ')[-2])
                        if lcount == 6:
                            ex_accession = line.split('Accession: ')[1].split('\t')[0]
                            all_accessions[ex_accession] = num_samples
        except Exception as e:
            # These are always empty search results.
            print(url)
            print(e)
            continue
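# The positional parse in the GEO branch above assumes each TEXT record has
# a line at index 4 ending in "<N> Samples" and a line at index 6 containing
# "Accession: GSExxxxx\t..." (inferred from the indices used, not from any
# documented NCBI format).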
# Then do RNA-seq
with open('config/supported_rnaseq_platforms.txt', 'r') as txt_file:
    lines = txt_file.readlines()
    for line in lines:
        with open('accessions/' + line.strip() + '.csv', 'r') as accession_file:
            csv_in = csv.reader(accession_file)
            for ccount, row in enumerate(csv_in):
                try:
                    if ccount == 0:
                        continue  # skip the header row
                    # Column 6 is the SRA study (SRP) accession; count one
                    # sample per row.
                    srp = row[5]
                    current = all_accessions.get(srp, 0)
                    all_accessions[srp] = current + 1
                except Exception as e:
                    print(e)
# Prefer GEO over AE. ArrayExpress imports GEO series under E-GEOD-<n>
# accessions, so E-GEOD-12345 and GSE12345 are the same experiment.
all_keys = list(all_accessions.keys())  # snapshot, since we delete while iterating
for key in all_keys:
    if "E-GEOD-" in key:
        gse = key.replace('E-GEOD-', 'GSE')
        if gse in all_accessions:
            print("Preferring " + gse + " over " + key)
            del all_accessions[key]
# Load our GEO/GPL map and look for series that never made it into SRA.
with open('config/geo_gpl_map.json') as json_data:
    geo_map = json.load(json_data)
for name, gpls in geo_map.items():
    try:
        for gpl in gpls:
            g = GEOparse.get_GEO(gpl, destdir='/tmp')
            series_ids = g.metadata['series_id']
            for series in series_ids:
                murl = MINIML_URL.replace('XXX', series)
                miniml_text = requests.get(murl, timeout=60).text
                if '<Relation type="SRA"' in miniml_text:
                    # Has an SRA relation, so it was already counted in
                    # the RNA-seq pass. Not unique!
                    continue
                if series in all_accessions:
                    continue
                od = xmltodict.parse(miniml_text)
                # NOTE: assumes more than one sample; xmltodict returns a
                # dict rather than a list for a single <Sample> element.
                num_samples = len(od['MINiML']['Sample'])
                print("Found unique!: " + series + " (" + str(num_samples) + ")")
                all_accessions[series] = num_samples
    except Exception as e:
        print(e)
print(sum(all_accessions.values()))
with open('all_accessions.json', 'w') as fp:
    json.dump(all_accessions, fp, indent=4)
# Drop into the debugger to poke at the results by hand.
pdb.set_trace()