Skip to content

Instantly share code, notes, and snippets.

@afrendeiro
Created May 26, 2017 17:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save afrendeiro/27d6d0f7f7f1766ade90407d78665da0 to your computer and use it in GitHub Desktop.
Save afrendeiro/27d6d0f7f7f1766ade90407d78665da0 to your computer and use it in GitHub Desktop.
GEO Series matrix to project and sample annotations
def series_matrix2csv(matrix_url, prefix=None):
    """
    Download a GEO series matrix and split it into project and sample annotations.

    The file is saved in the current working directory under the last path
    component of `matrix_url` (overwriting any existing file of that name)
    and then parsed line by line. Double quotes are removed from every field.

    Lines whose key starts with "!Sample_", or that carry more than one
    value, become columns of the sample table; remaining two-field lines
    become entries of the project series. Lines with a single field are
    ignored. The "!Series_" / "!Sample_" prefixes are dropped from the keys.
    Keys that occur more than once are all kept: the first occurrence keeps
    the plain name and later ones get ".1", ".2", ... appended.

    matrix_url: URL of a gzipped GEO series matrix.
    prefix: if not None, the annotations are also written to
        "<prefix>.project_annotation.csv" and "<prefix>.sample_annotation.csv".

    Returns a tuple (project, samples): a pandas Series of project fields and
    a pandas DataFrame with one column per sample field.
    """
    import gzip
    import shutil
    import urllib.request

    import pandas as pd

    def _strip_prefix(key, marker):
        # Drop the leading GEO marker without going through the pandas .str
        # accessor, which is unavailable on an empty (non-string) index.
        return key[len(marker):] if key.startswith(marker) else key

    def _store(target, key, value):
        # Never overwrite an existing key: repeated keys get a numeric suffix.
        name = key
        repeat = 0
        while name in target:
            repeat += 1
            name = "{}.{}".format(key, repeat)
        target[name] = value

    # Fetch with the standard library: no shell string is built from the URL,
    # no external `wget` binary is needed, and a failed download raises.
    filename = matrix_url.split("/")[-1]
    with urllib.request.urlopen(matrix_url) as response, open(filename, "wb") as handle:
        shutil.copyfileobj(response, handle)

    # separate project-related lines (key + one value)
    # from sample-related lines (key + one value per sample)
    # NOTE(review): rows of the data table (between "!series_matrix_table_begin"
    # and "!series_matrix_table_end") are still classified by field count only,
    # as before - confirm whether they should be excluded.
    prj_lines = dict()
    sample_lines = dict()
    # newline="\n" splits on "\n" only; a stray "\r" is removed by strip()
    with gzip.open(filename, "rt", encoding="utf-8", newline="\n") as f:
        for raw_line in f:
            fields = [x.replace("\"", "") for x in raw_line.strip().split("\t")]
            if len(fields) < 2:
                continue
            key, values = fields[0], fields[1:]
            if key.startswith("!Sample_") or len(values) > 1:
                _store(sample_lines, _strip_prefix(key, "!Sample_"), values)
            else:
                _store(prj_lines, _strip_prefix(key, "!Series_"), values[0])

    prj = pd.Series(prj_lines)
    samples = pd.DataFrame(sample_lines)

    if prefix is not None:
        prj.to_csv(prefix + ".project_annotation.csv", index=True)
        samples.to_csv(prefix + ".sample_annotation.csv", index=False)

    return prj, samples
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment