afrendeiro/series_matrix2csv.py

## series_matrix2csv.py
#!/usr/bin/env python

"""
Get a GEO series matrix file describing an experiment and
parse it into project level and sample level data.
"""

import os
from typing import Tuple, Union
import tempfile
from collections import Counter
import urllib.request as request
from contextlib import closing
import gzip

import click
import pandas as pd


# Alias used in annotations; a ``Union`` of a single type is that type itself.
DataFrame = pd.DataFrame


@click.command()
@click.option(
    "-u",
    "--matrix-url",
    required=True,
    help="FTP URL of gziped txt file with GEO series matrix.",
)
@click.option(
    "-p", "--prefix", default=None, help="Prefix path to write files to."
)
def series_matrix2csv(
    matrix_url: str, prefix: Union[str, None] = None
) -> Tuple[pd.Series, DataFrame]:
    """
    Get a GEO series matrix file describing an experiment and
    parse it into project level and sample level data.

    Lines with exactly one value after the key are treated as
    project-level annotation; lines with more than one value are treated
    as sample-level annotation (one value per sample). Keys that appear
    more than once are all kept: the first keeps its name and later ones
    get a ``_2``, ``_3``, ... suffix.

    Parameters
    ----------
    matrix_url: str
        URL of gzipped txt file with GEO series matrix.
    prefix: str
        Prefix path to write files to. If given, writes
        ``<prefix>.project_annotation.csv`` and
        ``<prefix>.sample_annotation.csv``.

    Returns
    -------
    Tuple[pd.Series, DataFrame]
        Project-level annotation (indexed by key, ``!Series_`` removed)
        and sample-level annotation (one column per key, ``!Sample_``
        removed).
    """
    # Use the public ``read()`` of the response rather than reaching into
    # ``r.fp.file``, which only exists on FTP responses.
    with closing(request.urlopen(matrix_url)) as r:
        content = gzip.decompress(r.read())
    lines = content.decode("utf-8").strip().split("\n")

    # separate lines with only one field (project-related)
    # from lines with >2 fields (sample-related)
    # NOTE(review): a series with a single sample has two-column sample
    # lines, which this column-count rule files under project-level data.
    prj_lines = dict()
    sample_lines = dict()
    idx_counts: Counter = Counter()
    col_counts: Counter = Counter()

    for line in lines:
        cols = [field.replace('"', "") for field in line.strip().split("\t")]
        key, values = cols[0], cols[1:]
        if len(values) == 1:
            counts, store, value = idx_counts, prj_lines, values[0]
        elif len(values) > 1:
            counts, store, value = col_counts, sample_lines, values
        else:
            # key-only lines (e.g. table delimiters) carry no annotation
            continue

        # Count occurrences under the ORIGINAL key so that the 3rd, 4th, ...
        # repeats get distinct suffixes instead of overwriting "<key>_2".
        counts[key] += 1
        if counts[key] > 1:
            key = f"{key}_{counts[key]}"
        store[key] = value

    # Plain ``str.replace`` keeps this independent of the pandas
    # ``.str.replace`` regex default and also works when nothing was parsed.
    prj = pd.Series(prj_lines)
    prj = prj.rename(index=lambda k: k.replace("!Series_", ""))

    samples = pd.DataFrame(sample_lines)
    samples = samples.rename(columns=lambda c: c.replace("!Sample_", ""))

    if prefix is not None:
        prj.to_csv(prefix + ".project_annotation.csv", index=True)
        samples.to_csv(prefix + ".sample_annotation.csv", index=False)

    return prj, samples


# Invoke the click command only when this file is executed as a script.
if __name__ == "__main__":
    series_matrix2csv()
	#!/usr/bin/env python

	"""
	Get a GEO series matrix file describing an experiment and
	parse it into project level and sample level data.
	"""

	import os
	from typing import Tuple, Union
	import tempfile
	from collections import Counter
	import urllib.request as request
	from contextlib import closing
	import gzip

	import click
	import pandas as pd


	# Alias used in annotations; a ``Union`` of a single type is that type itself.
	DataFrame = pd.DataFrame


	@click.command()
	@click.option(
	    "-u",
	    "--matrix-url",
	    required=True,
	    help="FTP URL of gziped txt file with GEO series matrix.",
	)
	@click.option(
	    "-p", "--prefix", default=None, help="Prefix path to write files to."
	)
	def series_matrix2csv(
	    matrix_url: str, prefix: Union[str, None] = None
	) -> Tuple[pd.Series, DataFrame]:
	    """
	    Get a GEO series matrix file describing an experiment and
	    parse it into project level and sample level data.

	    Lines with exactly one value after the key are treated as
	    project-level annotation; lines with more than one value are treated
	    as sample-level annotation (one value per sample). Keys that appear
	    more than once are all kept: the first keeps its name and later ones
	    get a ``_2``, ``_3``, ... suffix.

	    Parameters
	    ----------
	    matrix_url: str
	        URL of gzipped txt file with GEO series matrix.
	    prefix: str
	        Prefix path to write files to. If given, writes
	        ``<prefix>.project_annotation.csv`` and
	        ``<prefix>.sample_annotation.csv``.

	    Returns
	    -------
	    Tuple[pd.Series, DataFrame]
	        Project-level annotation (indexed by key, ``!Series_`` removed)
	        and sample-level annotation (one column per key, ``!Sample_``
	        removed).
	    """
	    # Use the public ``read()`` of the response rather than reaching into
	    # ``r.fp.file``, which only exists on FTP responses.
	    with closing(request.urlopen(matrix_url)) as r:
	        content = gzip.decompress(r.read())
	    lines = content.decode("utf-8").strip().split("\n")

	    # separate lines with only one field (project-related)
	    # from lines with >2 fields (sample-related)
	    # NOTE(review): a series with a single sample has two-column sample
	    # lines, which this column-count rule files under project-level data.
	    prj_lines = dict()
	    sample_lines = dict()
	    idx_counts: Counter = Counter()
	    col_counts: Counter = Counter()

	    for line in lines:
	        cols = [field.replace('"', "") for field in line.strip().split("\t")]
	        key, values = cols[0], cols[1:]
	        if len(values) == 1:
	            counts, store, value = idx_counts, prj_lines, values[0]
	        elif len(values) > 1:
	            counts, store, value = col_counts, sample_lines, values
	        else:
	            # key-only lines (e.g. table delimiters) carry no annotation
	            continue

	        # Count occurrences under the ORIGINAL key so that the 3rd, 4th, ...
	        # repeats get distinct suffixes instead of overwriting "<key>_2".
	        counts[key] += 1
	        if counts[key] > 1:
	            key = f"{key}_{counts[key]}"
	        store[key] = value

	    # Plain ``str.replace`` keeps this independent of the pandas
	    # ``.str.replace`` regex default and also works when nothing was parsed.
	    prj = pd.Series(prj_lines)
	    prj = prj.rename(index=lambda k: k.replace("!Series_", ""))

	    samples = pd.DataFrame(sample_lines)
	    samples = samples.rename(columns=lambda c: c.replace("!Sample_", ""))

	    if prefix is not None:
	        prj.to_csv(prefix + ".project_annotation.csv", index=True)
	        samples.to_csv(prefix + ".sample_annotation.csv", index=False)

	    return prj, samples


	# Invoke the click command only when this file is executed as a script.
	if __name__ == "__main__":
	    series_matrix2csv()