numpde/affy_in_python

## affy_in_python
# RA, 2020-09-12

# A look at the GSE60880:
#   Human Lung Fibroblasts treated with TGFbeta, IL1, EGF and small molecule inhibitors of TGFBR1 and p38.
#   HLF cells were incubated with each agent for 0.5 hours, 1 hour, 2 hours or 8 hours in standard conditions
#   before total RNA was prepared and hybridised to Affymetrix microarrays.
#   https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE60880

from tcga.utils import download
download = download.to(rel_path="download")

# Note: CDF for "Affymetrix Human Genome U133A 2.0 Array"
# can be obtained from
# http://www.affymetrix.com/support/technical/byproduct.affx?product=hgu133-20
# in "Human Genome U133A 2.0 Array (zip, 13 MB)"

# Note: CDF format is described at
# http://dept.stat.lsa.umich.edu/~kshedden/Courses/Stat545/Notes/AffxFileFormats/cdf.html
#

url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE60880&format=file"
import io
from pathlib import Path
import tempfile
import pandas as pd
import tarfile, gzip
from Bio.Affy import CelFile

with download(url).now.open(mode='rb') as fd:
    with tarfile.TarFile(fileobj=fd) as tf:
        for m in tf.getmembers():
            with tf.extractfile(m) as zf:
                with gzip.open(zf) as fd:
                    # cf = CelFile.read(io.StringIO(fd.read().decode()))
                    with tempfile.NamedTemporaryFile(mode='wb') as tmp:
                        tmp.write(fd.read())

                        from rpy2.robjects.packages import importr
                        from rpy2.robjects import r as R, pandas2ri, StrVector
                        importr('affy')
                        data = (R['ReadAffy'])(filenames=StrVector([tmp.name]), sampleNames=StrVector([Path(m.name).stem]))
                        eset = (R['expresso'])(data, bgcorrect_method="rma", normalize_method="quantiles", pmcorrect_method="pmonly", summary_method="medianpolish")
                        df = pandas2ri.rpy2py((R['as'])(eset, "data.frame")).T
                        print(df)

                        exit()

# Corresponding R code:
#
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
#
# BiocManager::install("affy")
# BiocManager::install("hgu133a2cdf")
#
# library("affy")
# browseVignettes("affy")
#
# data = ReadAffy(filenames=c("/home/ra/repos/cbb/20200912-TranscrResponse/download/wMs5HrzYcF7WviVIKryKpQK9AclIB6cC5vn_WeP9hM0=/data (1)/GSM1492784_GP866_3_209714002_92367.CEL"))
# data
#
# # http://127.0.0.1:29746/library/affy/doc/affy.pdf
# pm(data) # perfect match
# mm(data) # mismatch
# probeNames(data)   # affyID
# geneNames(data)
# sampleNames(data)
# featureNames(data)
#
# summaryAffyRNAdeg(AffyRNAdeg(data))
#
# eset = expresso(data, bgcorrect.method="rma",
#                 normalize.method="quantiles",
#                 pmcorrect.method="pmonly",
#                 summary.method="medianpolish")
#
# df = as(eset, "data.frame")
	# RA, 2020-09-12

	# A look at the GSE60880:
	# Human Lung Fibroblasts treated with TGFbeta, IL1, EGF and small molecule inhibitors of TGFBR1 and p38.
	# HLF cells were incubated with each agent for 0.5 hours, 1 hour, 2 hours or 8 hours in standard conditions
	# before total RNA was prepared and hybridised to Affymetrix microarrays.
	# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE60880

	from tcga.utils import download
	download = download.to(rel_path="download")

	# Note: CDF for "Affymetrix Human Genome U133A 2.0 Array"
	# can be obtained from
	# http://www.affymetrix.com/support/technical/byproduct.affx?product=hgu133-20
	# in "Human Genome U133A 2.0 Array (zip, 13 MB)"

	# Note: CDF format is described at
	# http://dept.stat.lsa.umich.edu/~kshedden/Courses/Stat545/Notes/AffxFileFormats/cdf.html
	#

	url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE60880&format=file"
	import io
	from pathlib import Path
	import tempfile
	import pandas as pd
	import tarfile, gzip
	from Bio.Affy import CelFile

	with download(url).now.open(mode='rb') as fd:
	with tarfile.TarFile(fileobj=fd) as tf:
	for m in tf.getmembers():
	with tf.extractfile(m) as zf:
	with gzip.open(zf) as fd:
	# cf = CelFile.read(io.StringIO(fd.read().decode()))
	with tempfile.NamedTemporaryFile(mode='wb') as tmp:
	tmp.write(fd.read())

	from rpy2.robjects.packages import importr
	from rpy2.robjects import r as R, pandas2ri, StrVector
	importr('affy')
	data = (R['ReadAffy'])(filenames=StrVector([tmp.name]), sampleNames=StrVector([Path(m.name).stem]))
	eset = (R['expresso'])(data, bgcorrect_method="rma", normalize_method="quantiles", pmcorrect_method="pmonly", summary_method="medianpolish")
	df = pandas2ri.rpy2py((R['as'])(eset, "data.frame")).T
	print(df)

	exit()

	# Corresponding R code:
	#
	# if (!requireNamespace("BiocManager", quietly = TRUE))
	# install.packages("BiocManager")
	#
	# BiocManager::install("affy")
	# BiocManager::install("hgu133a2cdf")
	#
	# library("affy")
	# browseVignettes("affy")
	#
	# data = ReadAffy(filenames=c("/home/ra/repos/cbb/20200912-TranscrResponse/download/wMs5HrzYcF7WviVIKryKpQK9AclIB6cC5vn_WeP9hM0=/data (1)/GSM1492784_GP866_3_209714002_92367.CEL"))
	# data
	#
	# # http://127.0.0.1:29746/library/affy/doc/affy.pdf
	# pm(data) # perfect match
	# mm(data) # mismatch
	# probeNames(data) # affyID
	# geneNames(data)
	# sampleNames(data)
	# featureNames(data)
	#
	# summaryAffyRNAdeg(AffyRNAdeg(data))
	#
	# eset = expresso(data, bgcorrect.method="rma",
	# normalize.method="quantiles",
	# pmcorrect.method="pmonly",
	# summary.method="medianpolish")
	#
	# df = as(eset, "data.frame")