necrolyte2/dnaplotter.py

## dnaplotter.py
#!/usr/bin/env python

import subprocess
import argparse

# Can test with
# python -m doctest dnaplotter.py

def count_ref_bases(reference):
    '''
    Counts all bases in a reference fasta file or already opened
    file stream

    # Tests
    >>> stream = ['>id1','aaaa\\n','bbbb\\n','>id2','cccc\\n','dddd']
    >>> count_ref_bases(stream)
    16
    '''
    stream = None
    # Determine if reference is a string or stream of lines
    # Determins if supplied argument reference is able to be iterated over
    if hasattr(reference,'__iter__'):
        stream = reference
    else:
        stream = open(reference)
    # Loop over every line
    count = 0
    for line in stream:
        # Remove newlines
        line = line.rstrip()
        if line.startswith('>'): # line[0] == '>'
            # Ignore ident lines
            continue # Just go back to for loop line
        else:
            # Sum length of all non ident lines
            count += len(line)
    return count

def fixed_depth(bamdepth, reflength):
    '''
    bamdepth is stream of lines of text that are in the format
    refname\tposition\tdepth

    reflength is the sum of all bases in a reference file

    #Tests
    >>> bamdepth = ['ref1\t3\t10', 'ref1\t4\t10', 'ref1\t6\t10']
    >>> reflength = 8
    >>> list(fixed_depth(bamdepth, reflength))
    [('ref1', 1, 0), ('ref1', 2, 0), ('ref1', 3, 10), ('ref1', 4, 10), ('ref1', 5, 0), ('ref1', 6, 10), ('ref1', 7, 0), ('ref1', 8, 0)]
    '''
    # Start base position at 1
    curpos = 1
    # Iterate over every line in bamdepth
    for line in bamdepth:
        # Remove newlines
        line = line.rstrip()
        # Split the line
        ref, pos, depth = line.split()
        # convert to int from string
        pos = int(pos)
        depth = int(depth)
        # If current position and bamdepth output don't match
        # then keep incrementing until they do
        while pos != curpos:
            yield ref, curpos, 0
            curpos += 1
        yield ref, curpos, depth
        curpos += 1
    # Now we are at the end of the bamdepth output
    # but may not be at the end of the reflength
    while curpos != reflength+1:
        yield ref, curpos, 0
        curpos += 1

def get_dnaplot(bamfile, reffile):
    reflength = count_ref_bases(reffile)
    # Define the shell command to run
    cmd = ['samtools', 'depth', bamfile]
    # Create a new process that runs our command using the shell
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    for line in fixed_depth(p.stdout, reflength):
        yield line

def print_dnaplot(bamfile, reffile):
    for line in get_dnaplot(bamfile, reffile):
        print "{0} {1}".format(line[1], line[2])

def parse_args():
    parser = argparse.ArgumentParser(
        '''Get dnaplot output file to run in dnaplotter'''
    )

    # Adds required argument so user has to supply a bam file
    parser.add_argument(
        'bam',
        help='Bam file to get depth data from'
    )

    # Adds required argument so user has to supply reference file
    parser.add_argument(
        'reference',
        help='Reference file to get genome length from'
    )

    return parser.parse_args()

def main():
    args = parse_args()
    print_dnaplot(args.bam, args.reference)

if __name__ == '__main__':
    main()
	#!/usr/bin/env python

	import subprocess
	import argparse

	# Can test with
	# python -m doctest dnaplotter.py

	def count_ref_bases(reference):
	'''
	Counts all bases in a reference fasta file or already opened
	file stream

	# Tests
	>>> stream = ['>id1','aaaa\\n','bbbb\\n','>id2','cccc\\n','dddd']
	>>> count_ref_bases(stream)
	16
	'''
	stream = None
	# Determine if reference is a string or stream of lines
	# Determins if supplied argument reference is able to be iterated over
	if hasattr(reference,'__iter__'):
	stream = reference
	else:
	stream = open(reference)
	# Loop over every line
	count = 0
	for line in stream:
	# Remove newlines
	line = line.rstrip()
	if line.startswith('>'): # line[0] == '>'
	# Ignore ident lines
	continue # Just go back to for loop line
	else:
	# Sum length of all non ident lines
	count += len(line)
	return count

	def fixed_depth(bamdepth, reflength):
	'''
	bamdepth is stream of lines of text that are in the format
	refname\tposition\tdepth

	reflength is the sum of all bases in a reference file

	#Tests
	>>> bamdepth = ['ref1\t3\t10', 'ref1\t4\t10', 'ref1\t6\t10']
	>>> reflength = 8
	>>> list(fixed_depth(bamdepth, reflength))
	[('ref1', 1, 0), ('ref1', 2, 0), ('ref1', 3, 10), ('ref1', 4, 10), ('ref1', 5, 0), ('ref1', 6, 10), ('ref1', 7, 0), ('ref1', 8, 0)]
	'''
	# Start base position at 1
	curpos = 1
	# Iterate over every line in bamdepth
	for line in bamdepth:
	# Remove newlines
	line = line.rstrip()
	# Split the line
	ref, pos, depth = line.split()
	# convert to int from string
	pos = int(pos)
	depth = int(depth)
	# If current position and bamdepth output don't match
	# then keep incrementing until they do
	while pos != curpos:
	yield ref, curpos, 0
	curpos += 1
	yield ref, curpos, depth
	curpos += 1
	# Now we are at the end of the bamdepth output
	# but may not be at the end of the reflength
	while curpos != reflength+1:
	yield ref, curpos, 0
	curpos += 1

	def get_dnaplot(bamfile, reffile):
	reflength = count_ref_bases(reffile)
	# Define the shell command to run
	cmd = ['samtools', 'depth', bamfile]
	# Create a new process that runs our command using the shell
	p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
	for line in fixed_depth(p.stdout, reflength):
	yield line

	def print_dnaplot(bamfile, reffile):
	for line in get_dnaplot(bamfile, reffile):
	print "{0} {1}".format(line[1], line[2])

	def parse_args():
	parser = argparse.ArgumentParser(
	'''Get dnaplot output file to run in dnaplotter'''
	)

	# Adds required argument so user has to supply a bam file
	parser.add_argument(
	'bam',
	help='Bam file to get depth data from'
	)

	# Adds required argument so user has to supply reference file
	parser.add_argument(
	'reference',
	help='Reference file to get genome length from'
	)

	return parser.parse_args()

	def main():
	args = parse_args()
	print_dnaplot(args.bam, args.reference)

	if __name__ == '__main__':
	main()