Skip to content

Instantly share code, notes, and snippets.

@dalexander
Last active January 3, 2016 13:19
Show Gist options
  • Save dalexander/8468885 to your computer and use it in GitHub Desktop.
Save dalexander/8468885 to your computer and use it in GitHub Desktop.
Find reads where the basecalling went haywire
#!/usr/bin/env python
from pbcore.io import BasH5Reader, M4Reader
from pbcore.util.Process import backticks
import sys, os.path as osp
def totalReadLength(m4r):
#return m4r.qseqlength
# qseqlength is bogus! use the offsets from the query string
extent = map(int, m4r.qName.split("/")[-1].split("_"))
return extent[1]-extent[0]
def holeNumber(m4r):
return int(m4r.qName.split("/")[1])
def mappedReadLength(m4r):
return (m4r.qEnd - m4r.qStart)
def mappedFraction(m4r):
# runway is max mappable read length for an aligner unaware of
# circular chromsomes.
if (m4r.tStrand == 1):
runway = m4r.tEnd
else:
runway = m4r.tLength-m4r.tStart
return float(mappedReadLength(m4r))/min(runway, totalReadLength(m4r))
def makeM4Alignment(basFname, refFname, outFname):
print "Running blasr..."
cmd = "blasr -bestn 1 -nproc 4 -m 4 %s %s -out %s" % \
(basFname, refFname, outFname)
_, retCode, errMsg = backticks(cmd)
if retCode != 0:
raise Exception(errMsg)
print "blasr completed."
if __name__=="__main__":
basFname = sys.argv[1]
refFname = sys.argv[2]
minReadLength = 20000
minMappedFraction = 0.5
if len(sys.argv) >= 4: minReadLength = sys.argv[3]
if len(sys.argv) >= 5: minMappedFraction = sys.argv[4]
m4filename = osp.splitext(osp.basename(basFname))[0] + ".m4"
makeM4Alignment(basFname, refFname, m4filename)
bas = BasH5Reader(basFname)
for record in M4Reader(m4filename):
zmw = bas[holeNumber(record)]
if (len(zmw.adapters) == 0 and
totalReadLength(record) >= minReadLength and
mappedFraction(record) < minMappedFraction):
print record.qName
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment