Skip to content

Instantly share code, notes, and snippets.

@daler
Created May 26, 2011 14:36
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save daler/993273 to your computer and use it in GitHub Desktop.
classify reads using HTSeq
#!/usr/bin/python
# GPLv3 license
# Copyright 2011 Ryan Dale, all rights reserved
# dalerr@niddk.nih.gov
import HTSeq
import argparse
import sys
import time
from collections import defaultdict
def classify(gff, bam, chroms=None, include=None, exclude=None,
             stranded=False, verbose=False):
    """
    Classify reads in *bam* based on featuretype as annotated in *gff*.

    Parameters
    ----------
    gff : str
        Filename of a GFF or GTF annotation file.
    bam : str
        Filename of a BAM file of reads to classify.
    chroms : dict, str, or None
        Dictionary of chromosome sizes to be passed to
        HTSeq.GenomicArrayOfSets, or, if it's a string and you have
        `pybedtools` installed, the chromsizes for that assembly will be
        used.  If None, it is set to 'auto', which lets each chromosome
        extend to infinity.
    include, exclude : list or None
        Mutually exclusive lists of featuretypes.  Default (both None) is
        to consider all featuretypes annotated in *gff*.
    stranded : bool
        If True, classification is strand-specific.
    verbose : bool
        If True, write progress and timing information to stderr.

    Returns
    -------
    (results, seq_space) : tuple of defaultdict(int)
        *results* maps class label -> read count; *seq_space* maps class
        label -> observed sequence space in bp.  Note that using 'auto'
        chroms will cause the 'unannotated' sequence space to be huge.

    Raises
    ------
    ValueError
        If both *include* and *exclude* are specified.
    """
    t0 = time.time()
    if verbose:
        sys.stderr.write('Parsing GFF...')
        sys.stderr.flush()

    # Fail fast on conflicting options, before any files are opened.
    if include and exclude:
        raise ValueError('Both include and exclude cannot be specified')

    gff = HTSeq.GFF_Reader(gff)
    bam = HTSeq.BAM_Reader(bam)

    if chroms is None:
        chroms = 'auto'

    # Get chromsizes dict using pybedtools if an assembly name was given.
    # (The original used `basestring`, which is Python-2-only; `str` works
    # on both Python 2 and 3 for the expected inputs.)
    if isinstance(chroms, str) and chroms != 'auto':
        try:
            import pybedtools
        except ImportError:
            sys.stderr.write("Can't import pybedtools to get chromsizes")
            raise
        # pybedtools chromsizes values are (start, stop) tuples;
        # GenomicArrayOfSets just wants the size.
        chroms = dict((key, val[1])
                      for key, val in pybedtools.chromsizes(chroms).items())

    # The data structure that is key to all of this: maps every genomic
    # position to the set of featuretypes annotated there.
    gaos = HTSeq.GenomicArrayOfSets(chroms, stranded=stranded)

    # Choose which featuretypes to add, then parse the GFF in a single
    # pass (the original duplicated this loop three times).
    if include:
        keep = lambda featuretype: featuretype in include
    elif exclude:
        keep = lambda featuretype: featuretype not in exclude
    else:
        keep = lambda featuretype: True

    for feature in gff:
        if not keep(feature.type):
            continue
        try:
            gaos[feature.iv] += feature.type
        except (IndexError, KeyError):
            # IndexError: feature interval out of range for its chromosome;
            # KeyError: chromosome missing from *chroms*.  Both are skipped.
            pass

    if verbose:
        sys.stderr.write('(%.1fs)\n' % (time.time() - t0))
        t0 = time.time()
        sys.stderr.write('Classifying reads...')
        sys.stderr.flush()

    # Iterate through BAM; label each read with the set of classes that
    # cover its *entire* length (intersection across all steps it spans).
    #
    # TODO: paired-end version?
    results = defaultdict(int)
    for read in bam:
        # Unaligned reads have no interval (read.iv is None) and would
        # crash the lookup below; they cannot be classified, so skip them.
        if not read.aligned:
            continue
        intersection_set = None
        for iv, step_set in gaos[read.iv].steps():
            if intersection_set is None:
                intersection_set = step_set.copy()
            else:
                intersection_set.intersection_update(step_set)
        classes = list(intersection_set)
        # Count the read once per "<class>_all" category -- i.e., per
        # overlapping featuretype regardless of what else overlaps.
        for cls in classes:
            results[cls + '_all'] += 1
        # A stable, hashable key for the full set of classes.
        results[';'.join(sorted(classes))] += 1
        results['total'] += 1

    if verbose:
        sys.stderr.write('(%.1fs)\n' % (time.time() - t0))
        t0 = time.time()
        sys.stderr.write('Calculating sequence space...')
        sys.stderr.flush()

    # Get sequence space (bp per class) by iterating over the G.A.O.S.
    seq_space = defaultdict(int)
    for iv, classes in gaos.steps():
        classes = list(classes)
        for cls in classes:
            seq_space[cls + '_all'] += iv.length
        seq_space[';'.join(sorted(classes))] += iv.length
        seq_space['total'] += iv.length

    if verbose:
        sys.stderr.write('(%.1fs)\n\n' % (time.time() - t0))
        sys.stderr.flush()

    # The empty-string key means "no annotation"; rename it (defaulting to
    # 0 when nothing was unannotated).
    for d in (results, seq_space):
        d['unannotated'] = d.pop('', 0)

    return results, seq_space
def main():
    """Command-line interface: classify reads, print most-abundant first.

    Output is tab-separated: class label, read count, sequence space (bp).
    """
    ap = argparse.ArgumentParser(
        description='Classify reads in a BAM file by the GFF featuretypes '
                    'they overlap.')
    # --gff and --bam are effectively mandatory (classify() cannot work
    # without them), so make argparse report a clean usage error instead
    # of crashing deep inside HTSeq when they are omitted.
    ap.add_argument('--gff', required=True, help='GFF or GTF file')
    ap.add_argument('--bam', required=True,
                    help='BAM file of reads to classify')
    ap.add_argument('--include', nargs='*', help='Featuretypes to include')
    ap.add_argument('--exclude', nargs='*', help='Featuretypes to exclude')
    ap.add_argument('--stranded', action='store_true',
                    help='Use stranded classification')
    ap.add_argument('--assembly',
                    help='Genome assembly (requires pybedtools)')
    ap.add_argument('-v', '--verbose', action='store_true',
                    help='Verbose mode')
    args = ap.parse_args()

    results, seq_space = classify(gff=args.gff,
                                  bam=args.bam,
                                  include=args.include,
                                  exclude=args.exclude,
                                  verbose=args.verbose,
                                  stranded=args.stranded,
                                  chroms=args.assembly)

    # Show most abundant class first; seq_space is a defaultdict, so
    # classes seen in reads but absent from the annotation report 0 bp.
    sorted_results = sorted(results.items(), key=lambda kv: kv[1],
                            reverse=True)
    for key, val in sorted_results:
        sys.stdout.write('%s\t%s\t%s\n' % (key, val, seq_space[key]))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment