Created
June 13, 2018 16:32
-
-
Save Miserlou/98f3180c001ebe77febe3b1c55d054d7 to your computer and use it in GitHub Desktop.
gen_accession_list.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## | |
# Hacked together, sorry.. | |
# Required CSVs were tediously exported manually from GEO web interface. | |
## | |
import csv | |
import json | |
import requests | |
import pdb | |
import pprint | |
import GEOparse | |
import xmltodict | |
import logging | |
GEOparse.logger.setLevel(logging.getLevelName("WARN")) | |
AE_URL = "https://www.ebi.ac.uk/arrayexpress/ArrayExpress-Experiments.txt?array=" | |
GEO_URL = "https://www.ncbi.nlm.nih.gov/gds?term=XXX[Related%20Series]&cmd=DetailsSearch&format=TEXT" | |
SRA_URL = "https://www.ncbi.nlm.nih.gov/sra?term=%22instrument%20illumina%20genome%20analyzer%22%5BProperties%5D" | |
MINIML_URL = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=XXX&targ=self&form=xml&view=quick" | |
all_accessions = {} | |
# First do microarray | |
with open('config/supported_microarray_platforms.csv', 'r') as csv_in: | |
csv_in = csv.reader(csv_in) | |
for ccount, row in enumerate(csv_in): | |
try: | |
if ccount is 0: | |
continue | |
accession = row[1] | |
# This is Array Express. | |
if '-' in accession: | |
url = AE_URL + accession | |
tsv_text = requests.get(url, timeout=60).text.encode("utf-8").splitlines() | |
tsv_in = csv.reader(tsv_text, delimiter='\t') | |
for count, tsv_row in enumerate(tsv_in): | |
if count is 0: | |
continue | |
ex_accesion = tsv_row[0] | |
num_assays = tsv_row[4] | |
all_accessions[ex_accesion] = int(num_assays) | |
else: | |
url = GEO_URL.replace('XXX', accession) | |
geo_text = requests.get(url, timeout=60).text.encode("utf-8") | |
geo_text = geo_text.split('<pre>\n')[1] | |
sections = geo_text.split('\n\n') | |
for section in sections: | |
num_samples = 0 | |
ex_accession = None | |
for lcount, line in enumerate(section.split('\n')): | |
if lcount == 4: | |
num_samples = int(line.split(' ')[-2]) | |
if lcount == 6: | |
ex_accession = line.split('Accession: ')[1].split('\t')[0] | |
all_accessions[ex_accession] = num_samples | |
except Exception as e: | |
# These are always empty search results | |
print(url) | |
print(e) | |
continue | |
# Then, do RNASeq | |
with open('config/supported_rnaseq_platforms.txt', 'r') as txt_file: | |
lines = txt_file.readlines() | |
for line in lines: | |
with open('accessions/' + line.strip() + '.csv', 'r') as accession_file: | |
csv_in = csv.reader(accession_file) | |
for ccount, row in enumerate(csv_in): | |
try: | |
if ccount is 0: | |
continue | |
srp = row[5] | |
current = all_accessions.get(srp, 0) | |
all_accessions[srp] = current + 1 | |
except Exception as e: | |
print(e) | |
# Prefer GEO over AE | |
all_keys = all_accessions.keys() | |
for key in all_keys: | |
if "E-GEOD-" in key: | |
gse = key.replace('E-GEOD-', 'GSE') | |
if gse in all_keys: | |
print("Preferring " + gse + " over " + key) | |
del all_accessions[key] | |
# Load our GEO/GPL map | |
unique = [] | |
with open('config/geo_gpl_map.json') as json_data: | |
geo_map = json.load(json_data) | |
for name, gpls in geo_map.items(): | |
try: | |
for gpl in gpls: | |
g = GEOparse.get_GEO(gpl, destdir='/tmp') | |
serieses = g.metadata['series_id'] | |
for series in serieses: | |
murl = MINIML_URL.replace('XXX', series) | |
miniml_text = requests.get(murl, timeout=60).text.encode("utf-8") | |
if '<Relation type="SRA"' in miniml_text: | |
# Not unique! | |
continue | |
else: | |
if series in all_accessions.keys(): | |
continue | |
od = xmltodict.parse(miniml_text) | |
num_samples = len(od['MINiML']['Sample']) | |
print("Found unique!: " + series + " (" + str(num_samples) + ")") | |
all_accessions[series] = num_samples | |
except Exception as e: | |
print(e) | |
print sum(all_accessions.values()) | |
with open('all_accessions.json', 'w') as fp: | |
json.dump(all_accessions, fp, indent=4) | |
pdb.set_trace() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment