Created
April 8, 2014 17:25
-
-
Save gpratt/10159232 to your computer and use it in GitHub Desktop.
iclip analysis tools
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""" | |
barcode_collapse.py read in a .bam file where the | |
first 9 nt of the read name | |
are the barcode and merge reads mapped to the same position that have the same barcode | |
""" | |
from collections import Counter, OrderedDict | |
import gzip | |
from optparse import OptionParser | |
import sys | |
import numpy as np | |
import pysam | |
#if we decide to add back in the start and stop functionality again keep in mind that | |
#reads are sorted by start site only, not start and stop site, so will need to use a pipeup based stragety | |
#for removing reads with same start and stop, simply iterating will not work | |
def collapse_base(reads, outBam, randomer, total_count, removed_count): | |
""" | |
Given a list of reads a base collapses them based on barcodes or just collapses them | |
NOT STRAND SPECIFIC, assumes all reads in list are from same positions, and collapses purely based on barcode | |
Oddly can't return and add two counters together, it creates another counter and causes the entire thing to be very slow | |
""" | |
barcode_set = Counter() | |
for read in reads: | |
barcode = read.qname.split(":")[0] if randomer else "total" | |
if barcode in barcode_set: | |
removed_count[barcode] += 1 | |
else: | |
outBam.write(read) | |
total_count[barcode] += 1 | |
barcode_set[barcode] += 1 | |
return barcode_set | |
def barcode_collapse(inBam, outBam, randomer): | |
""" | |
Removes reads with same start and same barcode | |
inBam : location of input bam file | |
outBam : location of output bam file | |
returns two dicts, total_count, a dict of all reads and their assocated barcodes | |
removed_count, a dict of all reads that have been removed and their attached barcodes | |
{barcode : count} | |
""" | |
out_total = open(outBam + ".total.wiggle", 'w') | |
out_barcodes = open(outBam + ".barcodes.wiggle", 'w') | |
out_entropy = open(outBam + ".entropy.wiggle", 'w') | |
inBam = pysam.Samfile(inBam, 'rb') | |
outBam = pysam.Samfile(outBam, 'wb', template=inBam) | |
cur_count = 0 | |
prev_chrom = None | |
prev_pos = 0 | |
#dictionary for handeling reads coming from negative strand | |
neg_dict = OrderedDict() | |
pos_list = [] #positive we iterate one base at a time, so no need for a dict, a list will do. | |
barcode_set = ([]) | |
removed_count = Counter() | |
total_count = Counter() | |
for i, read in enumerate(inBam.fetch()): | |
cur_chrom = read.rname | |
cur_count += 1 | |
#paramater options to allow for start and stop and barcodes vs normal | |
start = read.positions[-1] if read.is_reverse else read.positions[0] | |
cur_pos = read.positions[0] | |
#if we advance a position, reset barcode counting | |
if not cur_pos == prev_pos: | |
for (key_chrom, key_pos), reads in neg_dict.items(): | |
if key_pos >= cur_pos: | |
break | |
collapse_base(reads, outBam, randomer, total_count, removed_count) | |
del neg_dict[(key_chrom, key_pos)] | |
if prev_chrom != cur_chrom: | |
var_step = "variableStep chrom=%s\n" % (inBam.getrname(cur_chrom)) | |
out_total.write(var_step) | |
out_barcodes.write(var_step) | |
for x in neg_dict.keys(): | |
collapse_base(neg_dict[x], outBam, randomer, total_count, removed_count) | |
del neg_dict[x] | |
assert len(neg_dict) == 0 | |
#this is kind of a hack, doesn't take into account negative strand barcodes, but as a general idea it works... | |
barcode_set = collapse_base(pos_list, outBam, randomer, total_count, removed_count) | |
barcode_counts = np.array(barcode_set.values()) | |
barcode_probablity = barcode_counts / float(sum(barcode_counts)) | |
entropy = -1 * sum(barcode_probablity * np.log2(barcode_probablity)) | |
out_pos = prev_pos | |
out_entropy.write("\t".join(map(str, [out_pos, entropy])) + "\n") | |
out_total.write("\t".join(map(str, [out_pos, cur_count])) + "\n") | |
out_barcodes.write("\t".join(map(str, [out_pos, len(barcode_set)])) + "\n") | |
pos_list = [] | |
cur_count = 0 | |
if read.is_reverse: | |
try: | |
neg_dict[(cur_chrom, start)].append(read) | |
except KeyError as e: | |
neg_dict[(cur_chrom, start)] = [read] | |
else: | |
pos_list.append(read) | |
prev_pos = cur_pos | |
prev_chrom = cur_chrom | |
for x in neg_dict.keys(): | |
collapse_base(neg_dict[x], outBam, randomer, total_count, removed_count) | |
del neg_dict[x] | |
collapse_base(pos_list, outBam, randomer, total_count, removed_count) | |
out_pos = prev_pos | |
out_total.write("\t".join(map(str, [out_pos, cur_count])) + "\n") | |
out_barcodes.write("\t".join(map(str, [out_pos, len(barcode_set)])) + "\n") | |
inBam.close() | |
outBam.close() | |
out_total.close() | |
out_barcodes.close() | |
return total_count, removed_count | |
if __name__ == "__main__": | |
parser = OptionParser() | |
parser.add_option("-b", "--bam", dest="bam", help="bam file to barcode collapse") | |
parser.add_option("-c", "--randomer", action="store_true", dest="randomer", help="bam files are iclip barcoded") | |
parser.add_option("-o", "--out_file", dest="out_file") | |
parser.add_option("-m", "--metrics_file", dest="metrics_file") | |
(options,args) = parser.parse_args() | |
if not (options.bam.endswith(".bam")): | |
raise TypeError("%s, not bam file" % (options.bam)) | |
pysam.index(options.bam) | |
total_count, removed_count = barcode_collapse(options.bam, | |
options.out_file, | |
options.randomer, | |
) | |
with open(options.metrics_file, 'w') as metrics: | |
metrics.write("\t".join(["randomer", "total_count", "removed_count"]) + "\n") | |
for barcode in total_count.keys(): | |
metrics.write("\t".join(map(str, [barcode, total_count[barcode], removed_count[barcode]])) + "\n") | |
pysam.index(options.out_file) | |
sys.exit(0) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Converts randomer + barcoded fastq files into something that can be barcode collapsed and mapped | |
""" | |
from collections import Counter | |
import gzip | |
import os | |
from optparse import OptionParser | |
def reformat_read(name, seq, plus, quality, barcodes, | |
RANDOMER_FRONT_LENGTH=3, RANDOMER_BACK_LENGTH=2): | |
""" reformats read to have correct barcode attached | |
name - read name | |
seq - read sequence | |
plus - + | |
quality - read quality sequence this is a poor mans datastructure, designed for speed | |
barcodes, dictionary of barcodes to search for | |
returns str - barcode barcode found, str - randomer identified, str - reformateed read | |
""" | |
barcode_lengths = {len(barcode) for barcode in barcodes.keys()} | |
barcode = None | |
#assumes larger barcodes are less likely, and searches for them first | |
#for each barcode length see if known barcodes appear | |
for barcode_length in sorted(barcode_lengths, reverse=True): | |
adapter_length = RANDOMER_FRONT_LENGTH + RANDOMER_BACK_LENGTH + barcode_length | |
randomer = seq[:adapter_length] | |
cur_barcode = randomer[RANDOMER_FRONT_LENGTH:adapter_length - RANDOMER_BACK_LENGTH] | |
if cur_barcode in barcodes: | |
barcode = cur_barcode | |
break | |
name = name[0] + randomer + ":" + name[1:] | |
seq = seq[adapter_length:] | |
quality = quality[adapter_length:] | |
#if none appear the barcode is unassigne | |
if barcode is None: | |
barcode = "unassigned" | |
result = name + seq + plus + quality | |
return barcode, randomer, result | |
if __name__ == "__main__": | |
usage = """ takes raw fastq files and demultiplex inline randomer + adapter sequences """ | |
parser = OptionParser(usage) | |
parser.add_option("-f", "--fastq", dest="fastq", help="fastq file to barcode seperate") | |
parser.add_option("-b", "--barcodes", dest="barcodes", help="file of barcode / barcode id (tab sepearted, one barcode / barcode id on each line") | |
parser.add_option("-o", "--out_file", dest="out_file") | |
parser.add_option("-m", "--metrics_file", dest="metrics_file") | |
parser.add_option("--front", type=int, dest="front_length", help="Number of randomers before the barcode", default=3) | |
parser.add_option("--back", type=int, dest="back_length", help="Number of randomers after the barcode", default=2) | |
(options,args) = parser.parse_args() | |
#if a gziped file then we reassign open to be the gzip open and continue using that for the rest of the | |
#program | |
my_open = gzip.open if os.path.splitext(options.fastq)[1] == ".gz" else open | |
#creates different barcode files to assign reads to | |
RANDOMER_FRONT_LENGTH = options.front_length | |
RANDOMER_BACK_LENGTH = options.back_length | |
barcodes = {} | |
randomer_counts = {} | |
with open(options.barcodes) as barcodes_file: | |
for line in barcodes_file: | |
line = line.strip("\n").split("\t") | |
split_file = options.out_file.split(".") | |
split_file.insert(-1, line[1]) | |
barcodes[line[0]] = open(".".join(split_file), 'w') | |
randomer_counts[line[0]] = Counter() | |
split_file = options.out_file.split(".") | |
split_file.insert(-1, "unassigned") | |
barcodes['unassigned'] = open(".".join(split_file), 'w') | |
randomer_counts['unassigned'] = Counter() | |
#reads through initial file parses everything out | |
with my_open(options.fastq) as fastq_file, open(options.metrics_file, 'w') as metrics_file: | |
while True: | |
try: | |
name = fastq_file.next() | |
seq = fastq_file.next() | |
fastq_file.next() #got to consume the read | |
plus = "+\n" #sometimes the descriptor is here, don't want it | |
quality = fastq_file.next() | |
barcode, randomer, result = reformat_read(name, seq, plus, quality, barcodes, | |
RANDOMER_FRONT_LENGTH, RANDOMER_BACK_LENGTH) | |
randomer_counts[barcode][randomer] += 1 | |
barcodes[barcode].write(result) | |
except StopIteration: | |
break | |
for barcode, randomers in randomer_counts.items(): | |
for randomer, count in randomers.items(): | |
metrics_file.write("%s\t%s\t%s\n" % (barcode, randomer, count)) | |
#cleans up at the end | |
for fn in barcodes.values(): | |
fn.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment