rvprasad/mapper-par.py

## mapper-par.py
# python2.7
#
# Before using the script, execute the following.
#   PYTHONPATH=~/.pip easy_install --install-dir=~/.pip intervaltree
#   PYTHONPATH=~/.pip easy_install --install-dir=~/.pip plac
#
# To run the script, use the following command
#   PYTHONPATH=~/.pip python2.7 mapper-par.py <desc> <gff> <bam> <output>
#                                             <# cores>

from __future__ import print_function
from datetime import datetime
import multiprocessing
import plac
import pybam  # https://github.com/JohnLonginotto/pybam
from mapper import *


def worker(refs, alignments, results):
    reference = refs.get()
    refs.task_done()
    while True:
        alignment = alignments.get()
        alignments.task_done()
        output = process_alignment(reference, *alignment)
        results.put(output)


def process_alignments_par(bam_filename, output_filename, reference,
                           num_of_cores):
    print('Reading alignment file ' + bam_filename)

    refs = multiprocessing.JoinableQueue()
    alignments = multiprocessing.JoinableQueue()
    results = multiprocessing.JoinableQueue()
    for _ in range(1, num_of_cores):
        proc = multiprocessing.Process(target=worker,
                                       args=(refs, alignments, results))
        proc.start()
        refs.put(reference)
    refs.close()

    with open(output_filename, 'wt') as f:
        f.write('QName StartPos EndPos RName ID Parent StartPos \
EndPos Name Description\n'.replace(' ', '\t'))
        lines = 0
        limit = num_of_cores * 400
        print("{0} lines processed {1}".format(lines, datetime.now()))
        for sam_qname, sam_rname, sam_pos1, sam_seq in \
                pybam.read(bam_filename, ['sam_qname', 'sam_rname',
                                          'sam_pos1', 'sam_seq']):
            alignments.put([sam_qname, sam_rname, sam_pos1, len(sam_seq)])
            lines += 1
            if (lines % limit) == 0:
                for _ in range(0, limit):
                    output = results.get()
                    results.task_done()
                    f.writelines(output)
                print("processed {0} lines {1}".format(lines, datetime.now()))
        alignments.close()
        for _ in range(0, (lines % limit)):
            output = results.get()
            results.task_done()
            f.writelines(output)
        print("processed {0} lines {1}".format(lines, datetime.now()))
        results.close()


def process(desc_filename, gff_filename, bam_filename, output_filename,
            num_of_cores):
    print('Reading description file ' + desc_filename)
    id_to_desc = get_description(desc_filename)
    reference = get_reference_tree(gff_filename, id_to_desc)
    process_alignments_par(bam_filename, output_filename, reference,
                           int(num_of_cores))


if __name__ == "__main__":
    plac.call(process)

## mapper-pool.py
# python2.7
#
# Before using the script, execute the following.
#   PYTHONPATH=~/.pip easy_install --install-dir=~/.pip intervaltree
#   PYTHONPATH=~/.pip easy_install --install-dir=~/.pip plac
#
# To run the script, use the following command
#   PYTHONPATH=~/.pip python2.7 mapper-pool.py <desc> <gff> <bam> <output>
#                                              <# cores>

from __future__ import print_function
from datetime import datetime
import multiprocessing
import plac
import pybam  # https://github.com/JohnLonginotto/pybam
from mapper import *

reference = None


def initializer(refs):
    global reference
    reference = refs


def process_alignment_wrapper(inputs):
    return process_alignment(reference, *inputs)


def process_alignments_par(bam_filename, output_filename, reference,
                           num_of_cores):
    print('Reading alignment file ' + bam_filename)

    pool = multiprocessing.Pool(num_of_cores, initializer, (reference,))

    with open(output_filename, 'wt') as f:
        f.write('QName StartPos EndPos RName ID Parent StartPos \
EndPos Name Description\n'.replace(' ', '\t'))
        chunk_size = 400
        lines = 0
        alignments = []
        limit = num_of_cores * chunk_size
        print("{0} lines processed {1}".format(lines, datetime.now()))
        for sam_qname, sam_rname, sam_pos1, sam_seq in \
                pybam.read(bam_filename, ['sam_qname', 'sam_rname',
                                          'sam_pos1', 'sam_seq']):
            alignments.append([sam_qname, sam_rname, sam_pos1, len(sam_seq)])
            lines += 1
            if len(alignments) == limit:
                output = [x for y in
                          pool.imap_unordered(process_alignment_wrapper,
                                              alignments, chunk_size) for x in
                          y]
                f.writelines(output)
                print("processed {0} lines {1}".format(lines, datetime.now()))
                alignments = []
        output = pool.imap_unordered(process_alignments_wrapper, alignments)
        f.writelines(output)
        print("processed {0} lines {1}".format(lines, datetime.now()))
        pool.close()


def process(desc_filename, gff_filename, bam_filename, output_filename,
            num_of_cores):
    print('Reading description file ' + desc_filename)
    id_to_desc = get_description(desc_filename)
    reference = get_reference_tree(gff_filename, id_to_desc)
    process_alignments_par(bam_filename, output_filename, reference,
                           int(num_of_cores))


if __name__ == "__main__":
    plac.call(process)

## mapper-seq.py
# python2.7
#
# Before using the script, execute the following.
#   PYTHONPATH=~/.pip easy_install --install-dir=~/.pip intervaltree
#   PYTHONPATH=~/.pip easy_install --install-dir=~/.pip plac
#
# To run the script, use the following command
#   PYTHONPATH=~/.pip python2.7 mapper-seq.py <desc> <gff> <bam> <output>

from __future__ import print_function
from datetime import datetime
import plac
import pybam  # https://github.com/JohnLonginotto/pybam
from mapper import *


def process_alignments_seq(bam_filename, output_filename, reference):
    print('Reading alignment file ' + bam_filename)
    with open(output_filename, 'wt') as f:
        f.write('QName StartPos EndPos RName ID Parent StartPos \
EndPos Name Description\n'.replace(' ', '\t'))
        lines = 0
        print("{0} lines processed {1}".format(lines, datetime.now()))
        for sam_qname, sam_rname, sam_pos1, sam_seq in \
                pybam.read(bam_filename, ['sam_qname', 'sam_rname',
                                          'sam_pos1', 'sam_seq']):
            output = process_alignment(reference, sam_qname, sam_rname,
                                       sam_pos1, len(sam_seq))
            f.writelines(output)

            lines += 1
            if (lines % 1000) == 0:
                print("{0} lines processed {1}".format(lines, datetime.now()))


def process(desc_filename, gff_filename, bam_filename, output_filename):
    print('Reading description file ' + desc_filename)
    id_to_desc = get_description(desc_filename)
    reference = get_reference_tree(gff_filename, id_to_desc)
    process_alignments_seq(bam_filename, output_filename, reference)


if __name__ == "__main__":
    plac.call(process)

## mapper.py
# python2.7

from __future__ import print_function
import itertools
import intervaltree
import pybam  # https://github.com/JohnLonginotto/pybam


def get_reference_tree(gff_filename, id_to_desc):
    print('Reading reference file ' + gff_filename)
    reference = intervaltree.IntervalTree()
    i = 0
    with open(gff_filename, 'rt') as f:
        for line in f:
            if line[0] != '#':
                tmp1 = line.split('\t')
                tmp2 = {z[0]: z[1] for z in (map(lambda x: x.strip(),
                                                 x.split('='))
                        for x in tmp1[8].split(';'))}
                value = {x: tmp2[x] for x in set(['ID', 'Parent', 'Name'])
                         .intersection(tmp2.keys())}
                value['StartPos'] = tmp1[3]
                value['EndPos'] = tmp1[4]
                if 'ID' in value:
                    tmp3 = value['ID'].split('.')[0]
                    if '-' not in tmp3:
                        tmp3 = tmp3 + '-RA'
                    value['Description'] = id_to_desc[tmp3]
                tmp3 = (tmp1[0], '\t'.join(value.get(k, '') for k in
                                           ['ID', 'Parent', 'StartPos',
                                            'EndPos', 'Name', 'Description']))
                reference[int(tmp1[3]):(float(tmp1[4]) + 0.000001)] = tmp3

    return reference


def get_description(desc_filename):
    id_to_desc = {}
    with open(desc_filename, 'rt') as f:
        for line in f:
            tmp1 = line.split('\t')
            id_to_desc[tmp1[0]] = '\t'.join(map(lambda x: x.strip(), tmp1[1:]))

    return id_to_desc


def process_alignment(reference, sam_qname, sam_rname, sam_pos1, sam_seq_len):
    # get annotation for alignment.sam_pos1
    endPos = sam_pos1 + sam_seq_len - 1
    tmp1 = set(itertools.chain.from_iterable(reference[x]
                                             for x in xrange(sam_pos1,
                                                             endPos + 1)))
    tmp2 = list(x for x in tmp1 if x.data[0] == sam_rname)
    tmp3 = (x.data[1] for x in sorted(tmp2, key=lambda x: x.end - x.begin))
    output = ['\t'.join(map(str, [sam_qname, sam_pos1, endPos, sam_rname, t]))
              + '\n' for t in tmp3]
    return output
	# python2.7
	#
	# Before using the script, execute the following.
	# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip intervaltree
	# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip plac
	#
	# To run the script, use the following command
	# PYTHONPATH=~/.pip python2.7 mapper-par.py <desc> <gff> <bam> <output>
	# <# cores>

	from __future__ import print_function
	from datetime import datetime
	import multiprocessing
	import plac
	import pybam # https://github.com/JohnLonginotto/pybam
	from mapper import *


	def worker(refs, alignments, results):
	reference = refs.get()
	refs.task_done()
	while True:
	alignment = alignments.get()
	alignments.task_done()
	output = process_alignment(reference, *alignment)
	results.put(output)


	def process_alignments_par(bam_filename, output_filename, reference,
	num_of_cores):
	print('Reading alignment file ' + bam_filename)

	refs = multiprocessing.JoinableQueue()
	alignments = multiprocessing.JoinableQueue()
	results = multiprocessing.JoinableQueue()
	for _ in range(1, num_of_cores):
	proc = multiprocessing.Process(target=worker,
	args=(refs, alignments, results))
	proc.start()
	refs.put(reference)
	refs.close()

	with open(output_filename, 'wt') as f:
	f.write('QName StartPos EndPos RName ID Parent StartPos \
	EndPos Name Description\n'.replace(' ', '\t'))
	lines = 0
	limit = num_of_cores * 400
	print("{0} lines processed {1}".format(lines, datetime.now()))
	for sam_qname, sam_rname, sam_pos1, sam_seq in \
	pybam.read(bam_filename, ['sam_qname', 'sam_rname',
	'sam_pos1', 'sam_seq']):
	alignments.put([sam_qname, sam_rname, sam_pos1, len(sam_seq)])
	lines += 1
	if (lines % limit) == 0:
	for _ in range(0, limit):
	output = results.get()
	results.task_done()
	f.writelines(output)
	print("processed {0} lines {1}".format(lines, datetime.now()))
	alignments.close()
	for _ in range(0, (lines % limit)):
	output = results.get()
	results.task_done()
	f.writelines(output)
	print("processed {0} lines {1}".format(lines, datetime.now()))
	results.close()


	def process(desc_filename, gff_filename, bam_filename, output_filename,
	num_of_cores):
	print('Reading description file ' + desc_filename)
	id_to_desc = get_description(desc_filename)
	reference = get_reference_tree(gff_filename, id_to_desc)
	process_alignments_par(bam_filename, output_filename, reference,
	int(num_of_cores))


	if __name__ == "__main__":
	plac.call(process)
	# python2.7

	from __future__ import print_function
	import itertools
	import intervaltree
	import pybam # https://github.com/JohnLonginotto/pybam


	def get_reference_tree(gff_filename, id_to_desc):
	print('Reading reference file ' + gff_filename)
	reference = intervaltree.IntervalTree()
	i = 0
	with open(gff_filename, 'rt') as f:
	for line in f:
	if line[0] != '#':
	tmp1 = line.split('\t')
	tmp2 = {z[0]: z[1] for z in (map(lambda x: x.strip(),
	x.split('='))
	for x in tmp1[8].split(';'))}
	value = {x: tmp2[x] for x in set(['ID', 'Parent', 'Name'])
	.intersection(tmp2.keys())}
	value['StartPos'] = tmp1[3]
	value['EndPos'] = tmp1[4]
	if 'ID' in value:
	tmp3 = value['ID'].split('.')[0]
	if '-' not in tmp3:
	tmp3 = tmp3 + '-RA'
	value['Description'] = id_to_desc[tmp3]
	tmp3 = (tmp1[0], '\t'.join(value.get(k, '') for k in
	['ID', 'Parent', 'StartPos',
	'EndPos', 'Name', 'Description']))
	reference[int(tmp1[3]):(float(tmp1[4]) + 0.000001)] = tmp3

	return reference


	def get_description(desc_filename):
	id_to_desc = {}
	with open(desc_filename, 'rt') as f:
	for line in f:
	tmp1 = line.split('\t')
	id_to_desc[tmp1[0]] = '\t'.join(map(lambda x: x.strip(), tmp1[1:]))

	return id_to_desc


	def process_alignment(reference, sam_qname, sam_rname, sam_pos1, sam_seq_len):
	# get annotation for alignment.sam_pos1
	endPos = sam_pos1 + sam_seq_len - 1
	tmp1 = set(itertools.chain.from_iterable(reference[x]
	for x in xrange(sam_pos1,
	endPos + 1)))
	tmp2 = list(x for x in tmp1 if x.data[0] == sam_rname)
	tmp3 = (x.data[1] for x in sorted(tmp2, key=lambda x: x.end - x.begin))
	output = ['\t'.join(map(str, [sam_qname, sam_pos1, endPos, sam_rname, t]))
	+ '\n' for t in tmp3]
	return output