Skip to content

Instantly share code, notes, and snippets.

@cplaisier
Created March 27, 2018 03:27
Show Gist options
  • Save cplaisier/bf92245449d42d59cc0d68da455d3a56 to your computer and use it in GitHub Desktop.
Save cplaisier/bf92245449d42d59cc0d68da455d3a56 to your computer and use it in GitHub Desktop.
Example of using GEOparse
import csv
import os

import GEOparse
import matplotlib.pyplot as plt
import numpy
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
# Alternative, shorter accession lists left in place by the author:
#gseNums = ['GSE14860']
#gseNums = ['GSE49278', 'GSE19750', 'GSE10846']
gseNums = ['GSE19417', 'GSE49278', 'GSE19750', 'GSE10846', 'GSE39582', 'GSE35158', 'GSE19422', 'GSE19987', 'GSE32894', 'GSE35158', 'GSE27155', 'GSE33630', 'GSE56303', 'GSE29695', 'GSE10141', 'GSE9843', 'GSE25097', 'GSE32225', 'GSE26566', 'GSE65858', 'GSE39366', 'GSE22138', 'GSE46517', 'GSE71729', 'GSE8607', 'GSE4573', 'GSE29354', 'GSE71118', 'GSE19949', 'GSE15641', 'GSE26253', 'GSE15460', 'GSE21034', 'GSE9891', 'GSE32062', 'GSE72094', 'GSE26939', 'GSE29174', 'GSE31448', 'GSE45725', 'GSE40435', 'GSE14860']

# Metadata keys whose entries are "name: value" pairs rather than free text.
CHARACTERISTICS_KEYS = ('characteristics_ch1', 'characteristics_ch2')

# Directories the per-series CSV files are written into.
OUTPUT_DIRS = ('exprs', 'pData', 'annot')


def unique_in_order(items):
    """Return the items of *items* without repeats, keeping first-seen order."""
    seen = set()
    ordered = []
    for item in items:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered


def collect_sample_metadata(metadata, gsm_name, sample_metadata):
    """Merge one sample's metadata into the ``{field: {sample: value}}`` table.

    Args:
        metadata: Dict updated in place; maps a field name to a dict of
            sample name -> value.
        gsm_name: Name of the sample the values belong to.
        sample_metadata: Mapping of metadata key -> list of strings for
            that sample.

    Characteristics entries of the form ``name: value`` are stored under
    ``name``; entries without a colon are stored under the characteristics
    key itself. Every other key is recorded only when it has exactly one
    value (the field is still registered, so it shows up as ``NA``).
    """
    for key, value in sample_metadata.items():
        has_pairs = key in CHARACTERISTICS_KEYS and (
            len([entry for entry in value if entry != '']) > 1
            # Guard the index: an empty list must not raise IndexError.
            or (len(value) > 0 and ': ' in value[0])
        )
        if has_pairs:
            for entry in value:
                # Split on the first colon only, so a value that itself
                # contains a colon (e.g. "time: 12:30") stays intact.
                name, separator, content = entry.partition(':')
                if separator:
                    metadata.setdefault(name.strip(), {})[gsm_name] = content.strip()
                else:
                    metadata.setdefault(key, {})[gsm_name] = name.strip()
        else:
            per_sample = metadata.setdefault(key, {})
            if len(value) == 1:
                # Commas are still replaced by spaces so existing outputs
                # keep the same cell text.
                per_sample[gsm_name] = value[0].replace(',', ' ')


def build_metadata_rows(metadata, gsm_names):
    """Return the metadata table as rows: a header, then one row per field.

    Samples that have no value for a field get ``'NA'``.
    """
    rows = [['Metadata'] + list(gsm_names)]
    for key, per_sample in metadata.items():
        rows.append([key] + [per_sample.get(name, 'NA') for name in gsm_names])
    return rows


def finite_sample_vectors(frame):
    """Return one 1-D float array per column of *frame*, finite values only.

    NaN and +/-inf are removed because they would otherwise propagate into
    the quartiles computed for the boxplot.
    """
    vectors = []
    for column in frame.columns:
        values = numpy.asarray(frame[column], dtype=float)
        vectors.append(values[numpy.isfinite(values)])
    return vectors


def save_boxplot_page(pdf, frame, title):
    """Draw one box per column of *frame* and append the page to *pdf*."""
    try:
        # An explicit sequence of vectors gives one box per sample and does
        # not depend on how matplotlib unpacks a DataFrame.
        plt.boxplot(finite_sample_vectors(frame), showfliers=False)
        plt.title(title)
        pdf.savefig()
    finally:
        # Always close, so a failed plot cannot leak into the next page.
        plt.close()


def process_series(gse_id, pdf):
    """Download one GEO series and write its CSV files and boxplot pages.

    Args:
        gse_id: GEO series accession, e.g. ``'GSE14860'``.
        pdf: Open ``PdfPages`` object the boxplot pages are appended to.
    """
    gse = GEOparse.get_GEO(geo=gse_id, destdir="./downloads")

    # Get expression data and metadata matrices
    expr_columns = []
    gsm_names = []
    metadata = {}
    for gsm_name, gsm in gse.gsms.items():
        if gsm.metadata['type'][0] != 'RNA':
            continue
        # Expression data
        if len(gsm.table) > 0:
            # set_index returns a new frame, so gsm.table is left untouched.
            expr_columns.append(gsm.table.set_index('ID_REF')['VALUE'])
            gsm_names.append(gsm_name)
        # Metadata
        collect_sample_metadata(metadata, gsm_name, gsm.metadata)

    if expr_columns:
        # One concat for all samples instead of growing the frame per sample.
        exprs = pd.concat(expr_columns, axis=1, keys=gsm_names)

        # Write expression data matrix to file
        exprs.to_csv(os.path.join('exprs', gse_id + '_exprs.csv'))

        # Write metadata matrix to file; csv.writer quotes cells that
        # contain commas so the columns stay aligned.
        with open(os.path.join('pData', gse_id + '_pData.csv'), 'w', newline='') as out_file:
            writer = csv.writer(out_file, lineterminator='\n')
            writer.writerows(build_metadata_rows(metadata, gsm_names))

        # Plot boxplot of expression data
        save_boxplot_page(pdf, exprs, gse_id)
        try:
            save_boxplot_page(pdf, numpy.log2(exprs), 'log2(' + gse_id + ')')
        except Exception as err:
            # The log2 page is best-effort: report the failure and carry on.
            print('Skipping log2 boxplot for ' + gse_id + ': ' + str(err))
    else:
        print('No RNA samples with expression data in ' + gse_id
              + '; skipping expression, metadata and boxplot output')

    # Write out platform information
    for gpl_name, gpl in gse.gpls.items():
        gpl.table.to_csv(os.path.join('annot', gse_id + '_' + gpl_name + '_gpl.csv'))


def main():
    """Process every accession in ``gseNums`` into CSV files and one PDF."""
    for directory in OUTPUT_DIRS:
        os.makedirs(directory, exist_ok=True)
    with PdfPages('GSE_boxplots.pdf') as pdf:
        # The list contains a repeated accession; process each one once.
        for gse_id in unique_in_order(gseNums):
            process_series(gse_id, pdf)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment