Skip to content

Instantly share code, notes, and snippets.

@afrendeiro
Created December 10, 2020 16:49
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save afrendeiro/da042e4d7853b781db0f52b4fd17c7ce to your computer and use it in GitHub Desktop.
Save afrendeiro/da042e4d7853b781db0f52b4fd17c7ce to your computer and use it in GitHub Desktop.
Get a GEO series matrix file describing an experiment and parse it into project level and sample level data.
#!/usr/bin/env python
"""
Get a GEO series matrix file describing an experiment and
parse it into project level and sample level data.
"""
import os
from typing import Tuple, Union
import tempfile
from collections import Counter
import urllib.request as request
from contextlib import closing
import gzip
import click
import pandas as pd
# Shorthand for annotations. A ``Union`` over a single type collapses to that
# type, so this alias is simply ``pd.DataFrame``.
DataFrame = pd.DataFrame
def _parse_series_matrix_lines(lines):
    """Split series-matrix lines into project-level and sample-level tables.

    Each line is tab-separated; the first field is the key. Lines with
    exactly one value are treated as project-level, lines with two or more
    values as sample-level (one value per sample). Lines with no value are
    ignored. Double quotes are removed from keys and values.

    When a key repeats within the same group, every occurrence is kept: the
    first keeps its name and the n-th occurrence is stored as ``<key>_<n>``.
    """
    prj_lines = {}
    sample_lines = {}
    prj_seen: Counter = Counter()
    sample_seen: Counter = Counter()

    def unique(key, seen):
        # Count occurrences of the *base* key. Counting the renamed key
        # instead would make the 3rd, 4th, ... repeats all map to "<key>_2"
        # and overwrite one another.
        seen[key] += 1
        return key if seen[key] == 1 else f"{key}_{seen[key]}"

    for line in lines:
        cols = line.strip().split("\t")
        key = cols[0].replace('"', "")
        values = [x.replace('"', "") for x in cols[1:]]
        if len(values) == 1:
            prj_lines[unique(key, prj_seen)] = values[0]
        elif len(values) > 1:
            sample_lines[unique(key, sample_seen)] = values

    # Use ``rename`` with a callable rather than the ``.str`` accessor so an
    # empty table (whose labels are not string-typed) does not raise.
    prj = pd.Series(prj_lines).rename(lambda k: k.replace("!Series_", ""))
    samples = pd.DataFrame(sample_lines).rename(
        columns=lambda c: c.replace("!Sample_", "")
    )
    return prj, samples


@click.command()
@click.option(
    "-u",
    "--matrix-url",
    # Without a URL there is nothing to download; let click report a usage
    # error instead of failing later inside ``urlopen(None)``.
    required=True,
    help="FTP URL of gziped txt file with GEO series matrix.",
)
@click.option(
    "-p", "--prefix", default=None, help="Prefix path to write files to."
)
def series_matrix2csv(
    matrix_url: str, prefix: Union[str, None] = None
) -> Tuple[DataFrame, DataFrame]:
    """
    Get a GEO series matrix file describing an experiment and
    parse it into project level and sample level data.

    Parameters
    ----------
    matrix_url: str
        URL of gzipped txt file with GEO series matrix.
    prefix: str
        Prefix path to write files to. If given, the project annotation is
        written to ``<prefix>.project_annotation.csv`` and the sample
        annotation to ``<prefix>.sample_annotation.csv``. If omitted,
        nothing is written.

    Returns
    -------
    tuple
        ``(project, samples)``: a ``pandas.Series`` of project-level fields
        (``!Series_`` removed from the labels) and a ``pandas.DataFrame``
        with one column per sample-level field (``!Sample_`` removed from
        the column names).
    """
    # Read through the response's public ``read()``; this does not depend on
    # the private attributes of the FTP response wrapper and therefore also
    # works for other URL schemes handled by ``urlopen``.
    with closing(request.urlopen(matrix_url)) as r:
        content = gzip.decompress(r.read())
    lines = content.decode("utf-8").strip().split("\n")

    # separate lines with only one field (project-related)
    # from lines with >=2 fields (sample-related);
    # if the same key appears more than once, keep all but rename
    # them with a suffix
    prj, samples = _parse_series_matrix_lines(lines)

    if prefix is not None:
        prj.to_csv(prefix + ".project_annotation.csv", index=True)
        samples.to_csv(prefix + ".sample_annotation.csv", index=False)
    return prj, samples
if __name__ == "__main__":
    # Run as a script: invoke the click command, which reads its options
    # from the command line.
    series_matrix2csv()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment