Skip to content

Instantly share code, notes, and snippets.

@dgrtwo
Created September 15, 2012 00:21
Show Gist options
  • Save dgrtwo/3725741 to your computer and use it in GitHub Desktop.
Save dgrtwo/3725741 to your computer and use it in GitHub Desktop.
Barcode splitter for fastq sequencing files that splits using Levenshtein distance.
"""
barcode_splitter.py
Barcode splitter for fastq sequencing files, that matches using Levenshtein
distance.
USAGE:
python barcode_splitter.py reads.fastq index_reads.fastq barcodes.txt
Takes three files: a raw reads file, a barcode index reads file, where each
barcode corresponds to a read in the raw file, and an catalog file in a
format similar to:
group1 TGACCA
group2 GCCAAT
group3 CAGATC
This program matches each index to the closest barcode, including mutations,
insertions, and deletions. It then splits the file according to those indices-
in this case, into group1.fastq, group2.fastq, and group3.fastq.
"""
import sys
import warnings
import itertools
import collections
from Bio import SeqIO
def levenshtein_distance(s1, s2):
"""
Python version of Levenshtein distance for compatability. The third-party
library is much faster and recommended. Taken from recipe at:
http://code.activestate.com/recipes/576874-levenshtein-distance/
"""
l1 = len(s1)
l2 = len(s2)
matrix = [range(l1 + 1)] * (l2 + 1)
for zz in xrange(l2 + 1):
matrix[zz] = range(zz, zz + l1 + 1)
for zz in range(0, l2):
for sz in range(0, l1):
if s1[sz] == s2[zz]:
matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1,
matrix[zz][sz + 1] + 1, matrix[zz][sz])
else:
matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1,
matrix[zz][sz + 1] + 1, matrix[zz][sz] + 1)
return matrix[l2][l1]
try:
import Levenshtein
levenshtein_distance = Levenshtein.distance
except ImportError:
warnings.warn("python-Levenshtein package not found. The package is " +
"recommended as it makes barcode splitting much faster. " +
"Using native Python Levenshtein distance function instead.")
class BarcodeIndex:
"""Represents a set of indices, with the ability to find the closest one"""
def __init__(self, index_file, max_distance=3):
self.cache = {}
with open(index_file) as inf:
for l in inf:
if l.startswith("#") or l == "":
continue
g, barcode = l.split("\t")
self.cache[barcode] = g
self.barcodes = self.cache.keys()
self.groups = self.cache.values()
self.max_distance = max_distance
def find_barcode(self, barcode):
"""
match barcode and return the group it's supposed to be in. If
there is none within the Levenshtein distance given, or if there
is a tie, return None
"""
# if there's an exact match, return that
exact = self.cache.get(barcode)
if exact is not None:
return exact
# find the Levenshtein distance to each
distances = [levenshtein_distance(barcode, b) for b in self.barcodes]
best = min(distances)
# check if there's a tie or the distance is too great:
if best > self.max_distance or distances.count(best) > 1:
return None
# otherwise, return the best one, after caching it for future use
ret = self.groups[distances.index(best)]
self.cache[barcode] = ret
return ret
def split_reads(read_file, barcode_file, index_file):
"""
Given a fastq file of reads, a fastq file with corresponding barcodes,
and a file with the barcode index. Create one fastq file for each group
based on the closest matching barcode
"""
index = BarcodeIndex(index_file)
counts = collections.defaultdict(int)
# one output for each group
outfs = dict([(g, open(g + ".fastq", "w"))
for g in index.groups + ["Unassigned"]])
with open(read_file) as read_inf, open(barcode_file) as barcode_inf:
for r, b in itertools.izip(SeqIO.parse(read_inf, "fastq"),
SeqIO.parse(barcode_inf, "fastq")):
group = index.find_barcode(str(b.seq))
group = group if group != None else "Unassigned"
SeqIO.write([r], outfs[group], "fastq")
counts[group] += 1
for k, v in sorted(counts.items(), key=lambda t: t[0] == "Unassigned"):
print k, v
for o in outfs.values():
o.close()
if __name__ == "__main__":
if len(sys.argv) != 4:
print "USAGE: python barcode_splitter.py reads.fastq index.fastq",
print "barcodes.txt"
sys.exit(1)
[read_file, barcode_file, index_file] = sys.argv[1:]
split_reads(read_file, barcode_file, index_file)
@peterjc
Copy link

peterjc commented Sep 6, 2013

The with statement isn't part of the language until Python 2.6, but can be enabled under Python 2.5 - what version are you using @piyuveng1

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment