Created
March 27, 2018 03:27
-
-
Save cplaisier/bf92245449d42d59cc0d68da455d3a56 to your computer and use it in GitHub Desktop.
Example of using GEOparse
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import os

import GEOparse
import matplotlib.pyplot as plt
import numpy
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
# GEO series accessions to download and export.
# Alternative, shorter selections (uncomment to use instead of the full list):
#gseNums = ['GSE14860']
#gseNums = ['GSE49278', 'GSE19750', 'GSE10846']
# Each accession is listed exactly once: a repeated entry would be downloaded
# and written again and would add duplicate pages to the boxplot PDF.
gseNums = [
    'GSE19417', 'GSE49278', 'GSE19750', 'GSE10846', 'GSE39582', 'GSE35158',
    'GSE19422', 'GSE19987', 'GSE32894', 'GSE27155', 'GSE33630', 'GSE56303',
    'GSE29695', 'GSE10141', 'GSE9843', 'GSE25097', 'GSE32225', 'GSE26566',
    'GSE65858', 'GSE39366', 'GSE22138', 'GSE46517', 'GSE71729', 'GSE8607',
    'GSE4573', 'GSE29354', 'GSE71118', 'GSE19949', 'GSE15641', 'GSE26253',
    'GSE15460', 'GSE21034', 'GSE9891', 'GSE32062', 'GSE72094', 'GSE26939',
    'GSE29174', 'GSE31448', 'GSE45725', 'GSE40435', 'GSE14860',
]
def build_expression_matrix(sample_tables):
    """Combine per-sample tables into one probe-by-sample DataFrame.

    Args:
        sample_tables: Sequence of ``(gsm_name, table)`` pairs, where each
            table is a DataFrame with ``ID_REF`` and ``VALUE`` columns.

    Returns:
        A DataFrame indexed by ``ID_REF`` with one column per sample, named
        after the sample and kept in input order. An empty DataFrame is
        returned when ``sample_tables`` is empty.
    """
    if not sample_tables:
        return pd.DataFrame()
    # set_index() returns a new object, so the caller's tables are never
    # modified in place.
    columns = [table.set_index('ID_REF')['VALUE'] for _, table in sample_tables]
    # A single concat avoids re-copying the growing matrix once per sample.
    exprs = pd.concat(columns, axis=1)
    exprs.columns = [gsm_name for gsm_name, _ in sample_tables]
    return exprs


def parse_sample_metadata(gsm_name, sample_metadata, metadata):
    """Fold one sample's metadata into the per-series ``metadata`` mapping.

    Args:
        gsm_name: Sample identifier used as the inner key.
        sample_metadata: Mapping of field name to a list of strings.
        metadata: Mapping ``field -> {gsm_name: value}``; updated in place.

    ``characteristics_ch1`` / ``characteristics_ch2`` entries of the form
    ``"tag: value"`` are stored under ``tag``; entries without a colon are
    stored under the characteristics key itself. Every other field is stored
    only when it has exactly one value, with commas replaced by spaces.
    """
    for key, value in sample_metadata.items():
        is_tagged = key in ('characteristics_ch1', 'characteristics_ch2') and (
            sum(1 for item in value if item != '') > 1
            # Guard the first-element test so an empty list cannot raise.
            or (len(value) > 0 and ': ' in value[0])
        )
        if is_tagged:
            for entry in value:
                # Split on the first colon only, so values that themselves
                # contain a colon (e.g. "time: 12:30") keep their full text.
                tag, sep, content = entry.partition(':')
                if sep:
                    metadata.setdefault(tag.strip(), {})[gsm_name] = content.strip()
                else:
                    metadata.setdefault(key, {})[gsm_name] = entry.strip()
        else:
            # The field is registered even when no value is stored, so it
            # still gets a row in the output.
            # NOTE(review): multi-valued fields are left unset (written as
            # 'NA') -- confirm this is intended.
            field = metadata.setdefault(key, {})
            if len(value) == 1:
                field[gsm_name] = value[0].replace(',', ' ')


def write_metadata_csv(out_file, metadata, gsm_names):
    """Write the metadata matrix (one row per field, one column per sample).

    Args:
        out_file: Writable text file object.
        metadata: Mapping ``field -> {gsm_name: value}``.
        gsm_names: Sample names, in column order.

    Missing values are written as ``NA``. The csv module quotes any field
    containing a comma or quote so the columns stay aligned.
    """
    writer = csv.writer(out_file, lineterminator='\n')
    writer.writerow(['Metadata'] + list(gsm_names))
    for key, per_sample in metadata.items():
        writer.writerow([key] + [per_sample.get(name, 'NA') for name in gsm_names])


def save_boxplot_page(pdf, frame, title):
    """Append a boxplot page to ``pdf`` with one box per column of ``frame``.

    Args:
        pdf: Open ``PdfPages`` object to which the current figure is saved.
        frame: DataFrame whose columns are the groups to plot.
        title: Plot title.
    """
    try:
        # An explicit list of 1-D arrays yields one box per array.
        # NOTE(review): this replaces passing the transposed DataFrame, whose
        # grouping depends on how matplotlib splits 2-D input -- confirm that
        # one box per sample is the intended figure.
        plt.boxplot(list(frame.values.T), showfliers=False)
        plt.title(title)
        pdf.savefig()
    finally:
        # Always release the figure so a failed plot cannot leak into the
        # next page.
        plt.close()


def main():
    """Download each series in ``gseNums`` and export its data.

    For every accession this writes ``exprs/<GSE>_exprs.csv`` (expression
    matrix), ``pData/<GSE>_pData.csv`` (sample metadata), one
    ``annot/<GSE>_<GPL>_gpl.csv`` per platform, and adds raw and log2
    boxplot pages to ``GSE_boxplots.pdf``.
    """
    # open() does not create missing directories.
    for out_dir in ('exprs', 'pData', 'annot'):
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    with PdfPages('GSE_boxplots.pdf') as pdf:
        for gse1 in gseNums:
            gse = GEOparse.get_GEO(geo=gse1, destdir="./downloads")

            # Get expression data and metadata matrices
            sample_tables = []
            metadata = {}
            for gsm_name, gsm in gse.gsms.items():
                if gsm.metadata['type'][0] != 'RNA':
                    continue
                # Samples with an empty table contribute no expression column.
                if len(gsm.table) > 0:
                    sample_tables.append((gsm_name, gsm.table))
                parse_sample_metadata(gsm_name, gsm.metadata, metadata)

            exprs = build_expression_matrix(sample_tables)
            gsmNames = list(exprs.columns)

            if gsmNames:
                # Write expression data matrix to file
                with open('exprs/' + gse1 + '_exprs.csv', 'w') as outFile:
                    exprs.to_csv(outFile)

                # Write metadata matrix to file
                with open('pData/' + gse1 + '_pData.csv', 'w') as outFile:
                    write_metadata_csv(outFile, metadata, gsmNames)

                # Plot boxplot of expression data
                save_boxplot_page(pdf, exprs, gse1)
                # The log2 page is best-effort: report the failure and move on
                # rather than aborting the whole run.
                try:
                    save_boxplot_page(pdf, numpy.log2(exprs), 'log2(' + gse1 + ')')
                except Exception as err:
                    print('Skipping log2 boxplot for ' + gse1 + ': ' + str(err))
            else:
                print('No RNA expression tables found for ' + gse1
                      + '; skipping expression, metadata and boxplot output')

            # Write out platform information
            for gpl_name, gpl in gse.gpls.items():
                with open('annot/' + gse1 + '_' + gpl_name + '_gpl.csv', 'w') as outFile:
                    gpl.table.to_csv(outFile)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment