I have reused the code enough to make a package out of it.
The python package is deposited at Github. with an example in Jupyter notebook.
import pandas as pd | |
import rpy2.robjects as robjects | |
from rpy2.robjects import pandas2ri, Formula | |
pandas2ri.activate() | |
from rpy2.robjects.packages import importr | |
deseq = importr('DESeq2') | |
''' | |
Adopted from: https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2 | |
''' | |
to_dataframe = robjects.r('function(x) data.frame(x)') | |
class py_DESeq2: | |
''' | |
DESeq2 object through rpy2 | |
input: | |
count_matrix: should be a pandas dataframe with each column as count, and a id column for gene id | |
example: | |
id sampleA sampleB | |
geneA 5 1 | |
geneB 4 5 | |
geneC 1 2 | |
design_matrix: an design matrix in the form of pandas dataframe, see DESeq2 manual, samplenames as rownames | |
treatment | |
sampleA1 A | |
sampleA2 A | |
sampleB1 B | |
sampleB2 B | |
design_formula: see DESeq2 manual, example: "~ treatment"" | |
gene_column: column name of gene id columns, exmplae "id" | |
''' | |
def __init__(self, count_matrix, design_matrix, design_formula, gene_column='id'): | |
try: | |
assert gene_column in count_matrix.columns, 'Wrong gene id column name' | |
gene_id = count_matrix[gene_column] | |
except AttributeError: | |
sys.exit('Wrong Pandas dataframe?') | |
self.dds = None | |
self.deseq_result = None | |
self.resLFC = None | |
self.comparison = None | |
self.normalized_count_matrix = None | |
self.gene_column = gene_column | |
self.gene_id = count_matrix[self.gene_column] | |
self.count_matrix = pandas2ri.py2ri(count_matrix.drop(gene_column,axis=1)) | |
self.design_matrix = pandas2ri.py2ri(design_matrix) | |
self.design_formula = Formula(design_formula) | |
def run_deseq(self, **kwargs): | |
self.dds = deseq.DESeqDataSetFromMatrix(countData=self.count_matrix, | |
colData=self.design_matrix, | |
design=self.design_formula) | |
self.dds = deseq.DESeq(self.dds, **kwargs) | |
self.normalized_count_matrix = deseq.counts(self.dds, normalized=True) | |
def get_deseq_result(self, **kwargs): | |
self.comparison = deseq.resultsNames(self.dds) | |
self.deseq_result = deseq.results(self.dds, **kwargs) | |
self.deseq_result = to_dataframe(self.deseq_result) | |
self.deseq_result = pandas2ri.ri2py(self.deseq_result) ## back to pandas dataframe | |
self.deseq_result[self.gene_column] = self.gene_id.values |
from __future__ import print_function | |
import pandas as pd | |
import rpy2.robjects as robjects | |
from rpy2.robjects import pandas2ri, Formula | |
pandas2ri.activate() | |
from rpy2.robjects.packages import importr | |
dexseq = importr('DEXSeq') | |
bp = importr('BiocParallel') | |
''' | |
Adopted from: https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2 | |
''' | |
to_dataframe = robjects.r('function(x) data.frame(x)') | |
class py_DEXSeq: | |
''' | |
DEXSeq2 object through rpy2 | |
input: | |
count_matrix: should be a pandas dataframe with each column as count, and a id column for exon id | |
example: | |
exonID sampleA sampleB | |
geneA 5 1 | |
geneB 4 5 | |
geneC 1 2 | |
design_matrix: an design matrix in the form of pandas dataframe, see DESeq2 manual, samplenames as rownames | |
treatment | |
sampleA1 A | |
sampleA2 A | |
sampleB1 B | |
sampleB2 B | |
design_formula: see DEXSeq manual, example: "~ sample + exon + exon:treatment"" | |
feature_column: column name of exon id columns, example "id" | |
var_column: will pass to fitExpToVar for DEXSeq exon fold change | |
exons: exon id | |
genes: gene id for dexseq grouping | |
threads: number of threads to use | |
''' | |
def __init__(self, count_matrix, design_matrix, design_formula, | |
feature_column='id', var_column = 'condition', | |
exons=None, genes=None, threads=1): | |
try: | |
assert feature_column in count_matrix.columns, 'Wrong gene id column name' | |
assert var_column in design_matrix.columns, 'Wrong var column for DEXSeq' | |
except AttributeError: | |
sys.exit('Wrong Pandas dataframe?') | |
self.dxd = None | |
self.dxd_res = None | |
self.dexseq_result = None | |
self.comparison = None | |
self.normalized_count_matrix = None | |
self.feature_column = feature_column | |
self.exons = exons | |
self.genes = genes | |
self.gene_id = count_matrix[self.feature_column] | |
self.count_matrix = pandas2ri.py2ri(count_matrix.drop(feature_column,axis=1)) | |
self.design_matrix = pandas2ri.py2ri(design_matrix) | |
self.design_formula = Formula(design_formula) | |
self.BPPARAM = bp.MulticoreParam(workers=threads) | |
self.var_column = var_column | |
def run_dexseq(self, **kwargs): | |
self.dxd = dexseq.DEXSeqDataSet(countData=self.count_matrix, | |
sampleData=self.design_matrix, | |
design=self.design_formula, | |
featureID = self.exons, | |
groupID = self.genes) | |
print('Constructed DXD object') | |
self.dxd = dexseq.estimateSizeFactors_DEXSeqDataSet(self.dxd) | |
self.dxd = dexseq.estimateDispersions_DEXSeqDataSet(self.dxd, BPPARAM=self.BPPARAM) | |
print('Starting DEXSeq test') | |
self.dxd = dexseq.testForDEU(self.dxd, BPPARAM=self.BPPARAM) | |
self.dxd = dexseq.estimateExonFoldChanges(self.dxd, | |
fitExpToVar=self.var_column, | |
BPPARAM=self.BPPARAM) | |
print('Finished DEXSeq fold change') | |
def get_dexseq_result(self, **kwargs): | |
self.dexseq_result = to_dataframe(dexseq.DEXSeqResults(self.dxd), **kwargs) | |
self.dexseq_result = pandas2ri.ri2py(self.dexseq_result) ## back to pandas dataframe | |
self.dexseq_result['exons'] = self.exons | |
self.dexseq_result['genes'] = self.genes | |
self.dexseq_result.drop('genomicData', axis=1, inplace=True) | |
def normalized_count(self): | |
self.normalized_count_matrix = dexseq.counts(self.dxd,normalized=True) | |
return normalized_count_matrix |
Hi @borisstojilkovic, I'm sorry it didn't work for you out-of-the-box! I'm not exactly sure what the problem is from the error message, but I did spot some differences between the gist version and the current implementation here that eliminate the use of resultsNames
in get_deseq_result
. The gist version here is a little out-of-date, maybe go ahead and use the maintained code in diffexpr would help! please let me know if that works, I'd be interested whether it works on a windows machine!
Hi, I tried to run your code (deseq) on example data, but it seems that I am getting an error. Do you know how to solve this?