Skip to content

Instantly share code, notes, and snippets.

@wckdouglas
Last active May 14, 2023 16:44
Show Gist options
  • Star 26 You must be signed in to star a gist
  • Fork 5 You must be signed in to fork a gist
  • Save wckdouglas/3f8fb27a3d7a1eb24c598aa04f70fb25 to your computer and use it in GitHub Desktop.
Save wckdouglas/3f8fb27a3d7a1eb24c598aa04f70fb25 to your computer and use it in GitHub Desktop.
Running deseq2 in python

I have reused the code enough to make a package out of it.

The python package is deposited at Github. with an example in Jupyter notebook.

⚠️ this gist is out-of-date, please see the diffexpr package for a maintained version of the code

import pandas as pd
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri, Formula
pandas2ri.activate()
from rpy2.robjects.packages import importr
deseq = importr('DESeq2')
'''
Adopted from: https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2
'''
to_dataframe = robjects.r('function(x) data.frame(x)')
class py_DESeq2:
'''
DESeq2 object through rpy2
input:
count_matrix: should be a pandas dataframe with each column as count, and a id column for gene id
example:
id sampleA sampleB
geneA 5 1
geneB 4 5
geneC 1 2
design_matrix: an design matrix in the form of pandas dataframe, see DESeq2 manual, samplenames as rownames
treatment
sampleA1 A
sampleA2 A
sampleB1 B
sampleB2 B
design_formula: see DESeq2 manual, example: "~ treatment""
gene_column: column name of gene id columns, exmplae "id"
'''
def __init__(self, count_matrix, design_matrix, design_formula, gene_column='id'):
try:
assert gene_column in count_matrix.columns, 'Wrong gene id column name'
gene_id = count_matrix[gene_column]
except AttributeError:
sys.exit('Wrong Pandas dataframe?')
self.dds = None
self.deseq_result = None
self.resLFC = None
self.comparison = None
self.normalized_count_matrix = None
self.gene_column = gene_column
self.gene_id = count_matrix[self.gene_column]
self.count_matrix = pandas2ri.py2ri(count_matrix.drop(gene_column,axis=1))
self.design_matrix = pandas2ri.py2ri(design_matrix)
self.design_formula = Formula(design_formula)
def run_deseq(self, **kwargs):
self.dds = deseq.DESeqDataSetFromMatrix(countData=self.count_matrix,
colData=self.design_matrix,
design=self.design_formula)
self.dds = deseq.DESeq(self.dds, **kwargs)
self.normalized_count_matrix = deseq.counts(self.dds, normalized=True)
def get_deseq_result(self, **kwargs):
self.comparison = deseq.resultsNames(self.dds)
self.deseq_result = deseq.results(self.dds, **kwargs)
self.deseq_result = to_dataframe(self.deseq_result)
self.deseq_result = pandas2ri.ri2py(self.deseq_result) ## back to pandas dataframe
self.deseq_result[self.gene_column] = self.gene_id.values
from __future__ import print_function
import pandas as pd
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri, Formula
pandas2ri.activate()
from rpy2.robjects.packages import importr
dexseq = importr('DEXSeq')
bp = importr('BiocParallel')
'''
Adopted from: https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2
'''
to_dataframe = robjects.r('function(x) data.frame(x)')
class py_DEXSeq:
'''
DEXSeq2 object through rpy2
input:
count_matrix: should be a pandas dataframe with each column as count, and a id column for exon id
example:
exonID sampleA sampleB
geneA 5 1
geneB 4 5
geneC 1 2
design_matrix: an design matrix in the form of pandas dataframe, see DESeq2 manual, samplenames as rownames
treatment
sampleA1 A
sampleA2 A
sampleB1 B
sampleB2 B
design_formula: see DEXSeq manual, example: "~ sample + exon + exon:treatment""
feature_column: column name of exon id columns, example "id"
var_column: will pass to fitExpToVar for DEXSeq exon fold change
exons: exon id
genes: gene id for dexseq grouping
threads: number of threads to use
'''
def __init__(self, count_matrix, design_matrix, design_formula,
feature_column='id', var_column = 'condition',
exons=None, genes=None, threads=1):
try:
assert feature_column in count_matrix.columns, 'Wrong gene id column name'
assert var_column in design_matrix.columns, 'Wrong var column for DEXSeq'
except AttributeError:
sys.exit('Wrong Pandas dataframe?')
self.dxd = None
self.dxd_res = None
self.dexseq_result = None
self.comparison = None
self.normalized_count_matrix = None
self.feature_column = feature_column
self.exons = exons
self.genes = genes
self.gene_id = count_matrix[self.feature_column]
self.count_matrix = pandas2ri.py2ri(count_matrix.drop(feature_column,axis=1))
self.design_matrix = pandas2ri.py2ri(design_matrix)
self.design_formula = Formula(design_formula)
self.BPPARAM = bp.MulticoreParam(workers=threads)
self.var_column = var_column
def run_dexseq(self, **kwargs):
self.dxd = dexseq.DEXSeqDataSet(countData=self.count_matrix,
sampleData=self.design_matrix,
design=self.design_formula,
featureID = self.exons,
groupID = self.genes)
print('Constructed DXD object')
self.dxd = dexseq.estimateSizeFactors_DEXSeqDataSet(self.dxd)
self.dxd = dexseq.estimateDispersions_DEXSeqDataSet(self.dxd, BPPARAM=self.BPPARAM)
print('Starting DEXSeq test')
self.dxd = dexseq.testForDEU(self.dxd, BPPARAM=self.BPPARAM)
self.dxd = dexseq.estimateExonFoldChanges(self.dxd,
fitExpToVar=self.var_column,
BPPARAM=self.BPPARAM)
print('Finished DEXSeq fold change')
def get_dexseq_result(self, **kwargs):
self.dexseq_result = to_dataframe(dexseq.DEXSeqResults(self.dxd), **kwargs)
self.dexseq_result = pandas2ri.ri2py(self.dexseq_result) ## back to pandas dataframe
self.dexseq_result['exons'] = self.exons
self.dexseq_result['genes'] = self.genes
self.dexseq_result.drop('genomicData', axis=1, inplace=True)
def normalized_count(self):
self.normalized_count_matrix = dexseq.counts(self.dxd,normalized=True)
return normalized_count_matrix
@borisstojilkovic
Copy link

borisstojilkovic commented Nov 22, 2022

Hi, I tried to run your code (deseq) on example data, but it seems that I am getting an error. Do you know how to solve this?

Capture

@wckdouglas
Copy link
Author

Hi @borisstojilkovic, I'm sorry it didn't work for you out-of-the-box! I'm not exactly sure what the problem is from the error message, but I did spot some differences between the gist version and the current implementation here that eliminate the use of resultsNames in get_deseq_result. The gist version here is a little out-of-date, maybe go ahead and use the maintained code in diffexpr would help! please let me know if that works, I'd be interested whether it works on a windows machine!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment