Skip to content

Instantly share code, notes, and snippets.

@pansapiens
Last active October 26, 2016 22:42
Show Gist options
  • Save pansapiens/0546a5a185fdb604f0aaa68da0425610 to your computer and use it in GitHub Desktop.
Save pansapiens/0546a5a185fdb604f0aaa68da0425610 to your computer and use it in GitHub Desktop.
Given a counts table from featureCounts (subread) and a GTF/GFF FeatureDB database, output a table with a gene name column ("symbol") containing only counts from "protein_coding" exon features.
#!/usr/bin/env python
import sys
import os
import argparse
import pandas as pd
import gffutils
def create_featuredb(gtf_file, output_file=':memory:'):
"""
Create an (sqlite) gffutils FeatureDB from a GTF/GFF file.
:param gtf_file: A GTF of GFF format feature file.
:type gtf_file: basestring
:param output_file: An output file path for the FeatureDB, or ':memory:' to
create the database in memory only.
:type output_file: basestring
:return: A FeatureDB object
:rtype: gffutils.FeatureDB
"""
db = gffutils.create_db(gtf_file, output_file)
return db
def annotate_combined_count_file(count_file, featuredb_file, out_file=None,
feature_type='exon',
feature_source='protein_coding'):
"""
Using a FeatureDB (sqlite), adds a gene name ('symbol') column to a
featureCounts table (tsv) and by default outputs only rows for 'exon'
features that are of source 'protein_coding'.
:param count_file:
:type count_file: basestring
:param featuredb_file:
:type featuredb_file: basestring
:param out_file:
:type out_file: basestring
:param feature_type:
:type feature_type: basestring
:param feature_source:
:type feature_source: basestring
:return: The filename of the output file. None if outputting to stdout.
:rtype: basestring
"""
db = gffutils.FeatureDB(featuredb_file, keep_order=True)
# if the genes don't have a gene_id or gene_name set
# a KeyError will be raised
symbol_lookup = dict()
type_lookup = dict()
for f in db.features_of_type(feature_type):
if f.source == feature_source:
symbol_lookup[f['gene_id'][0]] = f['gene_name'][0]
type_lookup[f['gene_id'][0]] = f.source
df = pd.io.parsers.read_table(count_file, sep="\t", index_col=0, header=0)
df['symbol'] = df.apply(lambda x: symbol_lookup.get(x.name, ""), axis=1)
df['type'] = df.apply(lambda x: type_lookup.get(x.name, ""), axis=1)
df = df[df.type == feature_source]
if out_file is None:
df.to_csv(sys.stdout, sep="\t", index_label="id")
else:
df.to_csv(out_file, sep="\t", index_label="id")
return out_file
def parse_commandline():
parser = argparse.ArgumentParser(
description='Given a counts table from featureCounts (subread) and a '
'GTF/GFF FeatureDB database, output a table with a gene '
'name column ("symbol") containing only counts from '
'"protein_coding" exons features.')
parser.add_argument("-f", "--feature-type",
default='exon',
help="A feature type (GTF column 3), eg exon")
parser.add_argument("-s", "--feature-source",
default='protein_coding',
help="A feature 'source' (GTF column 2), "
"eg protein_coding")
parser.add_argument("-o", "--output",
default=None,
help="An output file for the filtered counts. Defaults"
"to stdout.")
parser.add_argument("-g", "--gxf-feature-file",
default=None,
help="If specified, first create the FeatureDB based"
"on this GTF / GFF file.")
parser.add_argument("count_file", help="A counts table, tab delimited.")
parser.add_argument("featuredb_file",
help="A gffutils FeatureDB, previously generated from "
"a GTF / GFF file. If the -g or "
"--gxf-feature-file option is specified, generate"
"this file.")
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_commandline()
if args.gxf_feature_file is not None:
if os.path.exists(args.featuredb_file):
sys.stderr.write("%s would be overwritten. Terminating." %
args.featuredb_file)
sys.exit(1)
create_featuredb(args.gxf_feature_file, args.featuredb_file)
sys.stderr.write("Created %s\n" % args.featuredb_file)
outfn = annotate_combined_count_file(args.count_file,
args.featuredb_file,
out_file=args.output,
feature_type=args.feature_type,
feature_source=args.feature_source)
if outfn is not None:
sys.stderr.write("Created %s\n" % outfn)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment