Skip to content

Instantly share code, notes, and snippets.

@gpratt
Created April 8, 2014 17:25
Show Gist options
  • Save gpratt/10159232 to your computer and use it in GitHub Desktop.
Save gpratt/10159232 to your computer and use it in GitHub Desktop.
iclip analysis tools
""""
barcode_collapse.py read in a .bam file where the
first 9 nt of the read name
are the barcode and merge reads mapped to the same position that have the same barcode
"""
from collections import Counter, OrderedDict
import gzip
from optparse import OptionParser
import sys
import numpy as np
import pysam
#if we decide to add back in the start and stop functionality again keep in mind that
#reads are sorted by start site only, not start and stop site, so will need to use a pipeup based stragety
#for removing reads with same start and stop, simply iterating will not work
def collapse_base(reads, outBam, randomer, total_count, removed_count):
"""
Given a list of reads a base collapses them based on barcodes or just collapses them
NOT STRAND SPECIFIC, assumes all reads in list are from same positions, and collapses purely based on barcode
Oddly can't return and add two counters together, it creates another counter and causes the entire thing to be very slow
"""
barcode_set = Counter()
for read in reads:
barcode = read.qname.split(":")[0] if randomer else "total"
if barcode in barcode_set:
removed_count[barcode] += 1
else:
outBam.write(read)
total_count[barcode] += 1
barcode_set[barcode] += 1
return barcode_set
def barcode_collapse(inBam, outBam, randomer):
"""
Removes reads with same start and same barcode
inBam : location of input bam file
outBam : location of output bam file
returns two dicts, total_count, a dict of all reads and their assocated barcodes
removed_count, a dict of all reads that have been removed and their attached barcodes
{barcode : count}
"""
out_total = open(outBam + ".total.wiggle", 'w')
out_barcodes = open(outBam + ".barcodes.wiggle", 'w')
out_entropy = open(outBam + ".entropy.wiggle", 'w')
inBam = pysam.Samfile(inBam, 'rb')
outBam = pysam.Samfile(outBam, 'wb', template=inBam)
cur_count = 0
prev_chrom = None
prev_pos = 0
#dictionary for handeling reads coming from negative strand
neg_dict = OrderedDict()
pos_list = [] #positive we iterate one base at a time, so no need for a dict, a list will do.
barcode_set = ([])
removed_count = Counter()
total_count = Counter()
for i, read in enumerate(inBam.fetch()):
cur_chrom = read.rname
cur_count += 1
#paramater options to allow for start and stop and barcodes vs normal
start = read.positions[-1] if read.is_reverse else read.positions[0]
cur_pos = read.positions[0]
#if we advance a position, reset barcode counting
if not cur_pos == prev_pos:
for (key_chrom, key_pos), reads in neg_dict.items():
if key_pos >= cur_pos:
break
collapse_base(reads, outBam, randomer, total_count, removed_count)
del neg_dict[(key_chrom, key_pos)]
if prev_chrom != cur_chrom:
var_step = "variableStep chrom=%s\n" % (inBam.getrname(cur_chrom))
out_total.write(var_step)
out_barcodes.write(var_step)
for x in neg_dict.keys():
collapse_base(neg_dict[x], outBam, randomer, total_count, removed_count)
del neg_dict[x]
assert len(neg_dict) == 0
#this is kind of a hack, doesn't take into account negative strand barcodes, but as a general idea it works...
barcode_set = collapse_base(pos_list, outBam, randomer, total_count, removed_count)
barcode_counts = np.array(barcode_set.values())
barcode_probablity = barcode_counts / float(sum(barcode_counts))
entropy = -1 * sum(barcode_probablity * np.log2(barcode_probablity))
out_pos = prev_pos
out_entropy.write("\t".join(map(str, [out_pos, entropy])) + "\n")
out_total.write("\t".join(map(str, [out_pos, cur_count])) + "\n")
out_barcodes.write("\t".join(map(str, [out_pos, len(barcode_set)])) + "\n")
pos_list = []
cur_count = 0
if read.is_reverse:
try:
neg_dict[(cur_chrom, start)].append(read)
except KeyError as e:
neg_dict[(cur_chrom, start)] = [read]
else:
pos_list.append(read)
prev_pos = cur_pos
prev_chrom = cur_chrom
for x in neg_dict.keys():
collapse_base(neg_dict[x], outBam, randomer, total_count, removed_count)
del neg_dict[x]
collapse_base(pos_list, outBam, randomer, total_count, removed_count)
out_pos = prev_pos
out_total.write("\t".join(map(str, [out_pos, cur_count])) + "\n")
out_barcodes.write("\t".join(map(str, [out_pos, len(barcode_set)])) + "\n")
inBam.close()
outBam.close()
out_total.close()
out_barcodes.close()
return total_count, removed_count
if __name__ == "__main__":
parser = OptionParser()
parser.add_option("-b", "--bam", dest="bam", help="bam file to barcode collapse")
parser.add_option("-c", "--randomer", action="store_true", dest="randomer", help="bam files are iclip barcoded")
parser.add_option("-o", "--out_file", dest="out_file")
parser.add_option("-m", "--metrics_file", dest="metrics_file")
(options,args) = parser.parse_args()
if not (options.bam.endswith(".bam")):
raise TypeError("%s, not bam file" % (options.bam))
pysam.index(options.bam)
total_count, removed_count = barcode_collapse(options.bam,
options.out_file,
options.randomer,
)
with open(options.metrics_file, 'w') as metrics:
metrics.write("\t".join(["randomer", "total_count", "removed_count"]) + "\n")
for barcode in total_count.keys():
metrics.write("\t".join(map(str, [barcode, total_count[barcode], removed_count[barcode]])) + "\n")
pysam.index(options.out_file)
sys.exit(0)
"""
Converts randomer + barcoded fastq files into something that can be barcode collapsed and mapped
"""
from collections import Counter
import gzip
import os
from optparse import OptionParser
def reformat_read(name, seq, plus, quality, barcodes,
RANDOMER_FRONT_LENGTH=3, RANDOMER_BACK_LENGTH=2):
""" reformats read to have correct barcode attached
name - read name
seq - read sequence
plus - +
quality - read quality sequence this is a poor mans datastructure, designed for speed
barcodes, dictionary of barcodes to search for
returns str - barcode barcode found, str - randomer identified, str - reformateed read
"""
barcode_lengths = {len(barcode) for barcode in barcodes.keys()}
barcode = None
#assumes larger barcodes are less likely, and searches for them first
#for each barcode length see if known barcodes appear
for barcode_length in sorted(barcode_lengths, reverse=True):
adapter_length = RANDOMER_FRONT_LENGTH + RANDOMER_BACK_LENGTH + barcode_length
randomer = seq[:adapter_length]
cur_barcode = randomer[RANDOMER_FRONT_LENGTH:adapter_length - RANDOMER_BACK_LENGTH]
if cur_barcode in barcodes:
barcode = cur_barcode
break
name = name[0] + randomer + ":" + name[1:]
seq = seq[adapter_length:]
quality = quality[adapter_length:]
#if none appear the barcode is unassigne
if barcode is None:
barcode = "unassigned"
result = name + seq + plus + quality
return barcode, randomer, result
if __name__ == "__main__":
usage = """ takes raw fastq files and demultiplex inline randomer + adapter sequences """
parser = OptionParser(usage)
parser.add_option("-f", "--fastq", dest="fastq", help="fastq file to barcode seperate")
parser.add_option("-b", "--barcodes", dest="barcodes", help="file of barcode / barcode id (tab sepearted, one barcode / barcode id on each line")
parser.add_option("-o", "--out_file", dest="out_file")
parser.add_option("-m", "--metrics_file", dest="metrics_file")
parser.add_option("--front", type=int, dest="front_length", help="Number of randomers before the barcode", default=3)
parser.add_option("--back", type=int, dest="back_length", help="Number of randomers after the barcode", default=2)
(options,args) = parser.parse_args()
#if a gziped file then we reassign open to be the gzip open and continue using that for the rest of the
#program
my_open = gzip.open if os.path.splitext(options.fastq)[1] == ".gz" else open
#creates different barcode files to assign reads to
RANDOMER_FRONT_LENGTH = options.front_length
RANDOMER_BACK_LENGTH = options.back_length
barcodes = {}
randomer_counts = {}
with open(options.barcodes) as barcodes_file:
for line in barcodes_file:
line = line.strip("\n").split("\t")
split_file = options.out_file.split(".")
split_file.insert(-1, line[1])
barcodes[line[0]] = open(".".join(split_file), 'w')
randomer_counts[line[0]] = Counter()
split_file = options.out_file.split(".")
split_file.insert(-1, "unassigned")
barcodes['unassigned'] = open(".".join(split_file), 'w')
randomer_counts['unassigned'] = Counter()
#reads through initial file parses everything out
with my_open(options.fastq) as fastq_file, open(options.metrics_file, 'w') as metrics_file:
while True:
try:
name = fastq_file.next()
seq = fastq_file.next()
fastq_file.next() #got to consume the read
plus = "+\n" #sometimes the descriptor is here, don't want it
quality = fastq_file.next()
barcode, randomer, result = reformat_read(name, seq, plus, quality, barcodes,
RANDOMER_FRONT_LENGTH, RANDOMER_BACK_LENGTH)
randomer_counts[barcode][randomer] += 1
barcodes[barcode].write(result)
except StopIteration:
break
for barcode, randomers in randomer_counts.items():
for randomer, count in randomers.items():
metrics_file.write("%s\t%s\t%s\n" % (barcode, randomer, count))
#cleans up at the end
for fn in barcodes.values():
fn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment