Skip to content

Instantly share code, notes, and snippets.

@rvprasad
Last active November 3, 2017 23:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rvprasad/1dd4a5466d528829b8d01c037d95913b to your computer and use it in GitHub Desktop.
Save rvprasad/1dd4a5466d528829b8d01c037d95913b to your computer and use it in GitHub Desktop.
Maps each alignment (in BAM) to reference gene sequence data (in GFF) and its description (in Description file).
# python2.7
#
# Before using the script, execute the following.
# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip intervaltree
# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip plac
#
# To run the script, use the following command
# PYTHONPATH=~/.pip python2.7 mapper-par.py <desc> <gff> <bam> <output>
# <# cores>
from __future__ import print_function
from datetime import datetime
import multiprocessing
import plac
import pybam # https://github.com/JohnLonginotto/pybam
from mapper import *
def worker(refs, alignments, results):
reference = refs.get()
refs.task_done()
while True:
alignment = alignments.get()
alignments.task_done()
output = process_alignment(reference, *alignment)
results.put(output)
def process_alignments_par(bam_filename, output_filename, reference,
num_of_cores):
print('Reading alignment file ' + bam_filename)
refs = multiprocessing.JoinableQueue()
alignments = multiprocessing.JoinableQueue()
results = multiprocessing.JoinableQueue()
for _ in range(1, num_of_cores):
proc = multiprocessing.Process(target=worker,
args=(refs, alignments, results))
proc.start()
refs.put(reference)
refs.close()
with open(output_filename, 'wt') as f:
f.write('QName StartPos EndPos RName ID Parent StartPos \
EndPos Name Description\n'.replace(' ', '\t'))
lines = 0
limit = num_of_cores * 400
print("{0} lines processed {1}".format(lines, datetime.now()))
for sam_qname, sam_rname, sam_pos1, sam_seq in \
pybam.read(bam_filename, ['sam_qname', 'sam_rname',
'sam_pos1', 'sam_seq']):
alignments.put([sam_qname, sam_rname, sam_pos1, len(sam_seq)])
lines += 1
if (lines % limit) == 0:
for _ in range(0, limit):
output = results.get()
results.task_done()
f.writelines(output)
print("processed {0} lines {1}".format(lines, datetime.now()))
alignments.close()
for _ in range(0, (lines % limit)):
output = results.get()
results.task_done()
f.writelines(output)
print("processed {0} lines {1}".format(lines, datetime.now()))
results.close()
def process(desc_filename, gff_filename, bam_filename, output_filename,
num_of_cores):
print('Reading description file ' + desc_filename)
id_to_desc = get_description(desc_filename)
reference = get_reference_tree(gff_filename, id_to_desc)
process_alignments_par(bam_filename, output_filename, reference,
int(num_of_cores))
if __name__ == "__main__":
plac.call(process)
# python2.7
#
# Before using the script, execute the following.
# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip intervaltree
# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip plac
#
# To run the script, use the following command
# PYTHONPATH=~/.pip python2.7 mapper-pool.py <desc> <gff> <bam> <output>
# <# cores>
from __future__ import print_function
from datetime import datetime
import multiprocessing
import plac
import pybam # https://github.com/JohnLonginotto/pybam
from mapper import *
reference = None
def initializer(refs):
global reference
reference = refs
def process_alignment_wrapper(inputs):
return process_alignment(reference, *inputs)
def process_alignments_par(bam_filename, output_filename, reference,
num_of_cores):
print('Reading alignment file ' + bam_filename)
pool = multiprocessing.Pool(num_of_cores, initializer, (reference,))
with open(output_filename, 'wt') as f:
f.write('QName StartPos EndPos RName ID Parent StartPos \
EndPos Name Description\n'.replace(' ', '\t'))
chunk_size = 400
lines = 0
alignments = []
limit = num_of_cores * chunk_size
print("{0} lines processed {1}".format(lines, datetime.now()))
for sam_qname, sam_rname, sam_pos1, sam_seq in \
pybam.read(bam_filename, ['sam_qname', 'sam_rname',
'sam_pos1', 'sam_seq']):
alignments.append([sam_qname, sam_rname, sam_pos1, len(sam_seq)])
lines += 1
if len(alignments) == limit:
output = [x for y in
pool.imap_unordered(process_alignment_wrapper,
alignments, chunk_size) for x in
y]
f.writelines(output)
print("processed {0} lines {1}".format(lines, datetime.now()))
alignments = []
output = pool.imap_unordered(process_alignments_wrapper, alignments)
f.writelines(output)
print("processed {0} lines {1}".format(lines, datetime.now()))
pool.close()
def process(desc_filename, gff_filename, bam_filename, output_filename,
num_of_cores):
print('Reading description file ' + desc_filename)
id_to_desc = get_description(desc_filename)
reference = get_reference_tree(gff_filename, id_to_desc)
process_alignments_par(bam_filename, output_filename, reference,
int(num_of_cores))
if __name__ == "__main__":
plac.call(process)
# python2.7
#
# Before using the script, execute the following.
# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip intervaltree
# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip plac
#
# To run the script, use the following command
# PYTHONPATH=~/.pip python2.7 mapper-seq.py <desc> <gff> <bam> <output>
from __future__ import print_function
from datetime import datetime
import plac
import pybam # https://github.com/JohnLonginotto/pybam
from mapper import *
def process_alignments_seq(bam_filename, output_filename, reference):
print('Reading alignment file ' + bam_filename)
with open(output_filename, 'wt') as f:
f.write('QName StartPos EndPos RName ID Parent StartPos \
EndPos Name Description\n'.replace(' ', '\t'))
lines = 0
print("{0} lines processed {1}".format(lines, datetime.now()))
for sam_qname, sam_rname, sam_pos1, sam_seq in \
pybam.read(bam_filename, ['sam_qname', 'sam_rname',
'sam_pos1', 'sam_seq']):
output = process_alignment(reference, sam_qname, sam_rname,
sam_pos1, len(sam_seq))
f.writelines(output)
lines += 1
if (lines % 1000) == 0:
print("{0} lines processed {1}".format(lines, datetime.now()))
def process(desc_filename, gff_filename, bam_filename, output_filename):
print('Reading description file ' + desc_filename)
id_to_desc = get_description(desc_filename)
reference = get_reference_tree(gff_filename, id_to_desc)
process_alignments_seq(bam_filename, output_filename, reference)
if __name__ == "__main__":
plac.call(process)
# python2.7
from __future__ import print_function
import itertools
import intervaltree
import pybam # https://github.com/JohnLonginotto/pybam
def get_reference_tree(gff_filename, id_to_desc):
print('Reading reference file ' + gff_filename)
reference = intervaltree.IntervalTree()
i = 0
with open(gff_filename, 'rt') as f:
for line in f:
if line[0] != '#':
tmp1 = line.split('\t')
tmp2 = {z[0]: z[1] for z in (map(lambda x: x.strip(),
x.split('='))
for x in tmp1[8].split(';'))}
value = {x: tmp2[x] for x in set(['ID', 'Parent', 'Name'])
.intersection(tmp2.keys())}
value['StartPos'] = tmp1[3]
value['EndPos'] = tmp1[4]
if 'ID' in value:
tmp3 = value['ID'].split('.')[0]
if '-' not in tmp3:
tmp3 = tmp3 + '-RA'
value['Description'] = id_to_desc[tmp3]
tmp3 = (tmp1[0], '\t'.join(value.get(k, '') for k in
['ID', 'Parent', 'StartPos',
'EndPos', 'Name', 'Description']))
reference[int(tmp1[3]):(float(tmp1[4]) + 0.000001)] = tmp3
return reference
def get_description(desc_filename):
id_to_desc = {}
with open(desc_filename, 'rt') as f:
for line in f:
tmp1 = line.split('\t')
id_to_desc[tmp1[0]] = '\t'.join(map(lambda x: x.strip(), tmp1[1:]))
return id_to_desc
def process_alignment(reference, sam_qname, sam_rname, sam_pos1, sam_seq_len):
# get annotation for alignment.sam_pos1
endPos = sam_pos1 + sam_seq_len - 1
tmp1 = set(itertools.chain.from_iterable(reference[x]
for x in xrange(sam_pos1,
endPos + 1)))
tmp2 = list(x for x in tmp1 if x.data[0] == sam_rname)
tmp3 = (x.data[1] for x in sorted(tmp2, key=lambda x: x.end - x.begin))
output = ['\t'.join(map(str, [sam_qname, sam_pos1, endPos, sam_rname, t]))
+ '\n' for t in tmp3]
return output
@rvprasad
Copy link
Author

Use either mapper-par.py, mapper-pool.py, or mapper-seq.py at the command line.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment