Skip to content

Instantly share code, notes, and snippets.

@ctb
Last active August 29, 2015 14:13
Show Gist options
  • Save ctb/789e905e1a758f1cff66 to your computer and use it in GitHub Desktop.
Save ctb/789e905e1a758f1cff66 to your computer and use it in GitHub Desktop.
*.bt2
*.sam
*.pos
*.fq
*.kh
*.kh.info
*.fai
*~
*.fq.gz
*-reads.fa
*.ct
*.ct.info
*.keep
*.pyc
Use with khmer branch test/ec_params.
#! /usr/bin/env python
"""
Compare two lists of error positions ('pos files').
first file is assumed to be prediction, second file is assumed to be "truth."
"""
import argparse
import sys
import screed
def do_counting(first, second, ignore_set):
    """Tally agreement between two read -> error-positions mappings.

    `first` is the prediction, `second` is the truth.  Returns a 5-tuple:
      n          -- reads with errors in both mappings
      o          -- reads with errors in `second`
      matches    -- of n, reads whose error positions agree exactly
      mismatches -- of n, reads whose error positions differ
      n_ignored  -- reads in `first` skipped because of `ignore_set`

    Reads erroneous in only one of the two mappings are deliberately NOT
    counted here; those one-sided cases are handled by the set
    intersections in main().
    """
    n = 0               # reads w/errors in first
    o = 0               # reads w/errors in second
    matches = 0         # exact matches
    mismatches = 0
    n_ignored = 0       # intentionally ignored

    # .items() instead of the Python-2-only .iteritems(); identical iteration
    for k, va in first.items():
        if k in ignore_set:
            n_ignored += 1
            continue

        vb = second.get(k)
        # no errors on either side (TN), or errors on only one side
        # (FP/FN, counted elsewhere via set intersections): skip.
        if not va or not vb:
            continue

        n += 1
        if va == vb:                    # exact match!
            matches += 1
        else:
            mismatches += 1             # no match :(

    for k, vb in second.items():
        if k in ignore_set:
            continue
        if vb:
            o += 1

    return n, o, matches, mismatches, n_ignored
# read in list of error positions per read
def read_pos_file(filename, ignore_set):
    """Parse a .pos file, yielding (read_name, [sorted error positions]).

    Each line is "readname p1,p2,..."; a bare read name means no errors.
    A positions field of 'V' marks a variable-coverage read: the read is
    added to `ignore_set` (mutated in place) and yielded with no
    positions.  Reads already in `ignore_set` also yield [].
    """
    with open(filename) as fp:          # close the file deterministically
        for line in fp:
            line = line.strip()
            try:
                # maxsplit=1: the name is everything before the first
                # space (the original used maxsplit=2, which would break
                # unpacking on a line with two spaces)
                read, posns = line.split(' ', 1)
            except ValueError:
                read = line
                posns = []

            if read in ignore_set:
                posns = []
            elif posns:
                if posns == 'V':        # '==', not 'is': identity compare
                    ignore_set.add(read)    # ignore variable coverage foo
                    posns = []
                else:
                    posns = sorted(int(x) for x in posns.split(','))
            yield read, posns
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-V', '--variable', default=False, action='store_true')
parser.add_argument('-d', '--debug', default=False, action='store_true')
parser.add_argument('pos_a')
parser.add_argument('pos_b')
parser.add_argument('reads')
args = parser.parse_args()
ignore_set = set() # reads to ignore because of low coverage
# read in two lists of positions & errors
print >>sys.stderr, 'loading posfiles'
a = dict(read_pos_file(args.pos_a, ignore_set))
b = dict(read_pos_file(args.pos_b, ignore_set))
if not args.variable: # eliminate ignore_set
ignore_set = set()
# get list of all reads
print >>sys.stderr, 'loading reads from', args.reads
all_names = set()
total_reads = 0
for record in screed.open(args.reads):
total_reads += 1
if record.name not in ignore_set:
all_names.add(record.name)
print >>sys.stderr, 'done!'
# reads we pay attention to
tracked_reads = len(all_names)
n_ignored = total_reads - tracked_reads
n, o, matches, mismatches, n_ignoredXX = do_counting(a, b, ignore_set)
print 'total # of reads: %d' % (total_reads,)
print 'IGNORED due to -V: %d (%d, %.2f%%)' % (n_ignored,
tracked_reads,
tracked_reads / float(total_reads) * 100.)
print 'total # of tracked reads:', tracked_reads
print '%d erroneous reads in %s' % (n, args.pos_a)
print '%d erroneous reads in %s' % (o, args.pos_b)
print '%d reads in common => all error positions AGREE' % (matches,)
print '%d DISAGREE' % (mismatches,)
# assume a is prediction, b is correct
incorrect_in_a = set([ k for k in a if a[k] ])
incorrect_in_b = set([ k for k in b if b[k] ])
correct_in_a = all_names - set([ k for k in a if a[k] ])
correct_in_b = all_names - set([ k for k in b if b[k] ])
if args.debug:
print 'DDD incorrect in a:', len(incorrect_in_a)
print 'DDD incorrect in b:', len(incorrect_in_b)
print 'DDD correct in a:', len(correct_in_a)
print 'DDD correct in b:', len(correct_in_b)
# m is reads thought to be erroneous by a, agree exactly with b
tp = matches
# fp: reads thought to be erroneous by a, but actually correct (in b)
# + places where a called errors that were incorrect
fp = mismatches + len(incorrect_in_a.intersection(correct_in_b))
if args.debug:
print 'DDD incorrect in a, correct in b:', \
incorrect_in_a.intersection(correct_in_b)
# fn: reads through to be correct by a, but actually erroneous (in b)
fn = len(correct_in_a.intersection(incorrect_in_b))
# tn: reads thought to be correct in both a and b
tn = len(correct_in_a.intersection(correct_in_b))
print 'TP:', tp
print 'TN:', tn
print 'FP: %d (%d + %d)' % (fp,
mismatches,
len(incorrect_in_a.intersection(correct_in_b)))
print 'FN:', fn
print 'sensitivity:', tp / float(tp + fn)
print 'specificity:', tp / float(tn + fp)
assert n <= total_reads
assert o <= total_reads
assert n_ignored == n_ignoredXX
assert len(all_names) == tp + tn + fp + fn, \
(len(all_names) - (tp+tn+fp+fn),
len(all_names),
tp,
tn,
fp,
fn)
if __name__ == '__main__':
main()
#! /usr/bin/env python
import sys
import argparse
import screed
import khmer
def main():
parser = argparse.ArgumentParser()
parser.add_argument('table')
parser.add_argument('ref')
args = parser.parse_args()
ct = khmer.load_counting_hash(args.table)
aligner = khmer.new_readaligner(ct, 5, 1.0)
for record in screed.open(args.ref):
s = record.sequence
s = s.replace('N', 'A')
score, graph_alignment, read_alignment, truncated = \
aligner.align(s)
assert not truncated
g = graph_alignment.replace('-', '')
r = read_alignment.replace('-', '')
print record.name
for kstart in range(0, len(g) - ct.ksize() + 1):
kmer = g[kstart:kstart+ct.ksize()]
print kstart, ct.get(kmer)
if __name__ == '__main__':
main()
/Users/t/Documents/papers/2014-streaming/pipeline/ecoliMG1655.fa
#! /usr/bin/env python
import screed
import argparse
def output_single(read):
    """Format a screed record as FASTQ when it carries quality scores
    ('accuracy'), FASTA otherwise."""
    if not hasattr(read, 'accuracy'):
        return ">%s\n%s\n" % (read.name, read.sequence)
    return "@%s\n%s\n+\n%s\n" % (read.name, read.sequence, read.accuracy)
def main():
    """For each read that survived Quake correction, emit the original
    read trimmed to the length Quake left it at, so the original and
    corrected files stay directly comparable."""
    parser = argparse.ArgumentParser()
    parser.add_argument('original')
    parser.add_argument('quake')
    parser.add_argument('output')
    args = parser.parse_args()

    # length of each read as Quake kept it
    read_info = {}
    for record in screed.open(args.quake):
        read_info[record.name] = len(record.sequence)

    # use a context manager so the output is flushed/closed on exit
    with open(args.output, 'w') as fp:
        for record in screed.open(args.original):
            if record.name in read_info:
                length = read_info[record.name]
                record.sequence = record.sequence[:length]
                # guard: FASTA input has no quality string
                if hasattr(record, 'accuracy'):
                    record.accuracy = record.accuracy[:length]
                fp.write(output_single(record))


if __name__ == '__main__':
    main()
#! /usr/bin/env python
import sys
import argparse
import screed
import khmer
from graphAlignment import align_long, AlignmentIndex
def main():
    """Run the long-read graph aligner (align_long) over each sequence
    in `ref` against counting table `table`, printing a per-record match
    summary and the alignment in 60-column blocks."""
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('ref')
    args = parser.parse_args()
    ct = khmer.load_counting_hash(args.table)
    aligner = khmer.new_readaligner(ct, 5, 1.0)
    for record in screed.open(args.ref):
        seq = record.sequence
        seq = seq.replace('N', 'A')     # aligner cannot handle N bases
        score, alignment = align_long(ct, aligner, seq)
        g = alignment.g                 # NOTE(review): g and r are unused here
        r = alignment.r
        # m = matching columns, n = total columns
        m, n = alignment.compare()
        print record.name, m, n, n - m, "%.3f%%" % (float(m)/ n * 100)
        for start in range(0, len(alignment), 60):
            print alignment[start:start+60]
        if 0:       # debug: map mismatch coordinates back to the reference
            gidx = AlignmentIndex(alignment)
            for gi, a, b in alignment.mismatches():
                print gi, a, b, gidx.get_ri(gi)
        if 0:       # debug: verify index round-trip
                    # NOTE(review): relies on gidx from the block above
            print len(seq), alignment.refseqlen()
            gidx._sanityCheck(seq)
if __name__ == '__main__':
    main()
#! /usr/bin/env python
import sys
import argparse
import screed
import khmer
def main():
parser = argparse.ArgumentParser()
parser.add_argument('table')
parser.add_argument('ref')
args = parser.parse_args()
ct = khmer.load_counting_hash(args.table)
aligner = khmer.new_readaligner(ct, 5, 1.0)
for record in screed.open(args.ref):
s = record.sequence
s = s.replace('N', 'A')
score, graph_alignment, read_alignment, truncated = \
aligner.align(s)
#assert not truncated
g = graph_alignment #.replace('-', '')
r = read_alignment #.replace('-', '')
line1 = []
line2 = []
line3 = []
for n, (a, b) in enumerate(zip(g, r)):
line1.append(a)
line3.append(b)
if a != b:
line2.append(' ')
else:
line2.append('|')
print '::', record.name, score, truncated
for start in range(0, len(line1), 60):
print "".join(line1[start:start+60])
print "".join(line2[start:start+60])
print "".join(line3[start:start+60])
print '--'
#print record.name, ct.find_spectral_error_positions(s, 10)
if __name__ == '__main__':
main()
#! /usr/bin/env python
import sys
import argparse
import screed
import khmer
def main():
parser = argparse.ArgumentParser()
parser.add_argument('table')
parser.add_argument('ref')
args = parser.parse_args()
ct = khmer.load_counting_hash(args.table)
for record in screed.open(args.ref):
s = record.sequence
s = s.replace('N', 'A')
for i in range(0, len(s) - ct.ksize() + 1):
kmer = s[i:i+ct.ksize()]
print i, ct.get(kmer)
print record.name, ct.find_spectral_error_positions(s, 10)
if __name__ == '__main__':
main()
import sys
import string
import array
import khmer
import screed
__complementTranslation = string.maketrans('ACTGactg-=Nn', 'TGACtgac-=Nn')
REGIONSIZE = 50
def reverse_complement(s):
    """
    Build reverse complement of 's', alignment-aware.

    Gap characters ('-', '=') and N map to themselves in the
    module-level translation table, so alignment columns survive.
    """
    # slice-reverse + translate replaces the original array('c')
    # reverse / string.join / string.translate dance; same result.
    return s[::-1].translate(__complementTranslation)
class GraphAlignment(object):
    """A pairwise alignment between a graph path ('g') and a read or
    reference sequence ('r').  Both strings have equal length; '-' and
    '=' are gap characters ('=' marks gap-filled, unaligned regions)."""

    def __init__(self, g, r):
        assert len(g) == len(r), (g, len(g), r, len(r))
        self.g = g.upper()              # graph-side sequence
        self.r = r.upper()              # read-side sequence

    def reverse_complement(self):
        """Return a new GraphAlignment with both sides reverse-complemented."""
        return GraphAlignment(reverse_complement(self.g),
                              reverse_complement(self.r))

    rc = reverse_complement             # short alias

    def __add__(self, other):
        return GraphAlignment(self.g + other.g, self.r + other.r)

    def __len__(self):
        return len(self.g)

    def refseqlen(self):
        """Number of actual bases (non-gap characters) on the read side."""
        r = self.r
        return r.count('G') + r.count('C') + r.count('T') + r.count('A')

    def __getitem__(self, i):
        if isinstance(i, slice):
            start, stop, step = i.indices(len(self.g))
            assert step == 1
            return GraphAlignment(self.g[start:stop], self.r[start:stop])
        return (self.g[i], self.r[i])

    def compare(self):
        """Return (n_matching_columns, n_total_columns).

        Bug fix: the original reused `n` as the mismatch-loop variable,
        clobbering the total-length return value with the index of the
        last mismatch.
        """
        total = len(self.g)
        matching = total
        for _idx, _a, _b in self.mismatches():
            matching -= 1
        return matching, total

    def mismatches(self):
        """Yield (column, graph_ch, read_ch) for each non-matching
        column; gap characters always count as mismatches."""
        for n, (a, b) in enumerate(zip(self.g, self.r)):
            if (a in '-=' or b in '-=' or a != b):
                yield n, a, b

    def __str__(self):
        # three-row rendering: graph / match bars / read
        line1 = []
        line2 = []
        line3 = []
        for (a, b) in zip(self.g, self.r):
            line1.append(a)
            line3.append(b)
            if a in '-=' or b in '-=' or a != b:
                line2.append(' ')
            else:
                line2.append('|')
        return "%s\n%s\n%s\n" % ("".join(line1), "".join(line2), "".join(line3))
def make_gap(r):
    """Return a GraphAlignment whose graph side is all '=' gap
    characters, covering read sequence `r`."""
    gap_side = '=' * len(r)
    return GraphAlignment(gap_side, r)
def _index_alignment(galign, freq=100):
g_to_r = {}
r_to_g = {}
si = 0
for gi, (a, b) in enumerate(zip(galign.g, galign.r)):
if gi % freq == 0:
g_to_r[gi] = si
if si % freq == 0:
r_to_g[si] = gi
if b in 'ACGT':
si += 1
return g_to_r, r_to_g
class AlignmentIndex(object):
    """Bidirectional coordinate index between alignment ('g') coordinates
    and reference-sequence ('r') coordinates of a GraphAlignment.

    Every `freq`-th coordinate is checkpointed in a dict by
    _index_alignment(); lookups jump to the nearest checkpoint at or
    below the query and walk forward column by column.
    """
    def __init__(self, galign, freq=100):
        self.g_to_r, self.r_to_g = _index_alignment(galign, int(freq))
        self.galign = galign
        self.freq = int(freq)
    def get_gi(self, ri):
        "Return alignment coordinate corresponding to reference seq coord."
        # jump to the checkpoint at or below ri ...
        rpost = int(ri / self.freq) * self.freq
        gpost = self.r_to_g[rpost]
        diff = ri - rpost
        gi = gpost
        # ... then walk forward, counting only real bases on the read side
        while diff > 0:
            (a, b) = self.galign[gi]
            if b in 'ACGT':
                diff -= 1
            gi += 1
        # make sure it's on a valid letter ;)
        while 1:
            (a, b) = self.galign[gi]
            if b in 'ACGT':
                break
            gi += 1
        return gi
    def get_ri(self, gi):
        "Return reference sequence coordinates from alignment coordinates."
        # jump to the checkpoint at or below gi ...
        gpost = int(gi / self.freq) * self.freq
        ri = self.g_to_r[gpost]
        diff = gi - gpost
        # ... then walk forward, advancing ri only on real bases
        while diff > 0:
            (a, b) = self.galign[gpost]
            diff -= 1
            gpost += 1
            if b in 'ACGT':
                ri += 1
        return ri
    def _sanityCheck(self, seq):
        # verify both coordinate mappings round-trip against `seq`
        alignment = self.galign
        for gi in range(len(alignment)):
            if alignment[gi][1] in 'ACGT':
                ri = self.get_ri(gi)
                assert alignment[gi][1] == seq[ri]
        for ri in range(len(seq)):
            gi = self.get_gi(ri)
            assert seq[ri] == alignment[gi][1]
def stitch(galign, K):
    """Stitch together multiple alignments, assuming all overlaps are
    off-by-one on the right end.

    Each alignment except the last contributes everything but its final
    K-1 columns (the seed-k-mer overlap with the next segment); the last
    segment is kept whole.  Returns one combined GraphAlignment.
    """
    ga = []
    ra = []
    # all but the last: trim the K-1 column overlap on the right
    # (unused `len_so_far` / `n` counters from the original removed)
    for gg in galign[:-1]:
        g = gg.g
        r = gg.r
        assert len(g) == len(r)
        ga.append(g[:-K + 1])
        ra.append(r[:-K + 1])

    # last segment is appended untrimmed
    ga.append(galign[-1].g)
    ra.append(galign[-1].r)

    return GraphAlignment("".join(ga), "".join(ra))
def align_segment_right(aligner, seq, next_ch=None):
    """Align `seq` against the graph left-to-right.

    If the aligner truncates (stops partway through `seq`), either retry
    the unaligned tail right-to-left anchored on `next_ch` (the first
    character of the following region), or pad it with an explicit gap,
    so the read side always covers all of `seq`.
    Returns (score, GraphAlignment).
    """
    score, g, r, truncated = aligner.align(seq)
    galign = GraphAlignment(g, r)

    # did it fail to align across entire segment?
    if truncated:
        aligned_length = galign.refseqlen()
        unaligned_len = len(seq) - aligned_length

        # if we are given next ch, try aligning backwards from next seed
        if next_ch:
            unaligned_seq = seq[-unaligned_len:]
            sr, ggr = align_segment_left(aligner, unaligned_seq + next_ch)
            # drop the extra next_ch column before appending
            galign += ggr[:-1]
            score += sr              # need to adjust score down... @@
        else:
            # if not, just build a gap...
            galign += make_gap(seq[-unaligned_len:])

    # invariant: the read side of the result covers seq completely
    assert galign.refseqlen() == len(seq)
    return score, galign
def align_segment_left(aligner, seq):
    """Align `seq` right-to-left: align its reverse complement
    left-to-right, then flip the resulting alignment back."""
    rc_seq = reverse_complement(seq)
    rc_score, rc_align = align_segment_right(aligner, rc_seq)
    return rc_score, rc_align.rc()
def align_long(ct, aligner, sta):
    """Align a long sequence `sta` piecewise against the graph.

    The sequence is chopped into REGIONSIZE chunks; the highest-abundance
    k-mer in each chunk becomes a seed.  Seed-to-seed regions are aligned
    left-to-right (with right-to-left fallback across gaps), the prefix
    before the first seed is aligned right-to-left, and the pieces are
    stitched back together.  Returns (total_score, GraphAlignment).
    """
    K = ct.ksize()
    # first, pick seeds for each chunk
    seeds = []
    for start in range(0, len(sta), REGIONSIZE):
        region = sta[start:start + REGIONSIZE + K - 1]
        if len(region) < K:
            # tail shorter than one k-mer: cannot seed it
            assert len(sta) - start < REGIONSIZE
            break
        seed_pos = find_highest_abund_kmer(ct, region)
        seeds.append(start + seed_pos)
    assert len(seeds) == len(set(seeds))
    # then, break between-seed intervals down into regions, starting at
    # first seed.
    region_coords = []
    for i in range(len(seeds) - 1):
        seed_pos = seeds[i]
        end_seed = seeds[i + 1] - 1
        region_coords.append((seed_pos, end_seed + K))
    # account for situation where last region is too small to align.
    if len(sta) - seeds[-1] > K:
        region_coords.append((seeds[-1], len(sta)))
    else:
        # merge the tiny tail into the previous region instead
        (last_seed, _) = region_coords.pop()
        region_coords.append((last_seed, len(sta)))
    # start building piecewise alignments, anchored by seeds.
    alignments = []
    scores = []
    n = 0
    for (start, end) in region_coords[:-1]:
        # next_ch lets a truncated alignment retry from the next seed
        score, galign = align_segment_right(aligner, sta[start:end],
                                            next_ch=sta[end])
        scores.append(score)
        alignments.append(galign)
        n += 1
    # deal with end (no possibility of alignment from right)
    (start, end) = region_coords[-1]
    score, galign = align_segment_right(aligner, sta[start:end])
    alignments.append(galign)
    scores.append(score)
    # deal with beginning, too: reverse align from first seed.
    leftend = sta[0:seeds[0] + K]
    score, galign = align_segment_left(aligner, leftend)
    galign = galign[:-1]                # trim off seed k-mer
    alignments.insert(0, galign)
    scores.insert(0, score)
    # stitch all the alignments together
    final = stitch(alignments, K)
    return sum(scores), final
def find_highest_abund_kmer(ct, r):
    """Return the start position of the most abundant k-mer in `r`,
    according to counting table `ct`.

    Bug fix: the running maximum (`abund`) was never updated, so the
    function previously returned the *last* position whose count beat
    the first k-mer's count rather than the true argmax.
    """
    pos = 0
    abund = ct.get(r[:ct.ksize()])
    for kstart in range(1, len(r) - ct.ksize() + 1):
        kmer = r[kstart:kstart + ct.ksize()]
        count = ct.get(kmer)
        if count > abund:
            abund = count       # track the new maximum
            pos = kstart
    return pos
def test_1():
    """Integration test: build a counting table from local read data,
    align a concatenated multi-region sequence with align_long, and
    sanity-check the coordinate index.  Requires khmer and the
    'simple-haplo-reads.fa.keep' file in the working directory."""
    ct = khmer.new_counting_hash(20, 1.1e6, 4)
    ct.consume_fasta('simple-haplo-reads.fa.keep')
    #ct = khmer.load_counting_hash('simple-haplo-reads.dn.ct')
    aligner = khmer.new_readaligner(ct, 5, 1.0)
    seq = "".join("""GTCCTGGCGGTCCCCATTCA
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCT
GTAGGAAGTCACATGTGCTAAATATCAG
TGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
GCACAGTTTTTTATACGAAACCGTCTGCGTCAAGACGGGCCACATGGT
""".strip().split())
    score, alignment = align_long(ct, aligner, seq)
    print len(seq), alignment.refseqlen()
    for start in range(0, len(alignment), 60):
        print alignment[start:start+60]
    # round-trip the coordinate index against the input sequence
    gidx = AlignmentIndex(alignment)
    gidx._sanityCheck(seq)
def test_2():
    """Integration test: like test_1, but loads the query sequence from
    'simplefoo.fa'.  NOTE(review): the inline literal below is built and
    then immediately overwritten -- likely leftover from test_1."""
    ct = khmer.new_counting_hash(20, 1.1e6, 4)
    ct.consume_fasta('simple-haplo-reads.fa.keep')
    aligner = khmer.new_readaligner(ct, 5, 1.0)
    seq = "".join("""GTCCTGGCGGTCCCCATTCA
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCT
GTAGGAAGTCACATGTGCTAAATATCAG
TGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTAT
""".strip().split())
    seq = list(screed.open('simplefoo.fa'))[0].sequence
    score, alignment = align_long(ct, aligner, seq)
    print len(seq), alignment.refseqlen()
    for start in range(0, len(alignment), 60):
        print alignment[start:start+60]
    # round-trip the coordinate index against the input sequence
    gidx = AlignmentIndex(alignment)
    gidx._sanityCheck(seq)
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# evaluate against perfect?
QUAKE=/Users/t/Documents/papers/2014-streaming/Quake

# default target: detailed error reports for reference- and self-corrected sets
all: ref-errors.details.txt corr-errors.details.txt

display: compare.txt refcompare.txt

# 10k-read subsample used for most comparisons
ecoli-subset.fq: ecoli-mapped.fq.gz
	sample-reads-randomly.py -R 1 -N 10000 ecoli-mapped.fq.gz -o ecoli-subset.fq

# 100k-read subsample for Quake correction
ecoli-subset.100k.fq: ecoli-mapped.fq.gz
	sample-reads-randomly.py -R 2 -N 100000 ecoli-mapped.fq.gz -o ecoli-subset.100k.fq

# run Quake (k=14, quality offset 33) on the 100k subsample
ecoli-subset.100k.cor.fq.gz: ecoli-subset.100k.fq
	echo ecoli-subset.100k.fq > ecoli_quake_list.txt
	#ln -fs /usr/local/bin/jellyfish .
	#cat ecoli-mapped.fq.gz.keep | $(QUAKE)/bin/count-qmers -q 33 -k 14 > ecoli_dn_counts.out
	#$(QUAKE)/bin/cov_model.py ecoli_dn_counts.out > ecoli_dn_counts.cov
	$(QUAKE)/bin/correct -f ecoli_quake_list.txt -p 4 -k 14 -q 33 -c 7.94 -z -m ecoli_dn_counts.out
	#mv ecoli-mapped.cor.fq.gz ecoli-mapped.dn.cor.fq.gz

# trim originals to Quake's read lengths so the sets stay comparable
ecoli-subset.100k.quake.fq: ecoli-subset.100k.cor.fq.gz ecoli-subset.100k.fq
	extract-original-reads-from-quake-cor.py ecoli-subset.100k.fq ecoli-subset.100k.cor.fq.gz ecoli-subset.100k.quake.fq

# map the uncorrected subsample and extract error positions
original.sam: ecoli-subset.fq
	cat ecoli-subset.fq | bowtie2 -p 4 -x ecoli -U - -S original.sam

original.sam.pos: original.sam
	./sam-scan.py ecoliMG1655.fa original.sam -o original.sam.pos

### k=13 correction pipeline
ecoli.dn.k13.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 13 -x 4e7 ecoli.dn.k13.kh ecoli-mapped.fq.gz.keep.gz

corr.k13.C17.fq: ecoli.dn.k13.kh ecoli-subset.fq
	../sandbox/error-correct-pass2.py ecoli.dn.k13.kh ecoli-subset.fq -o corr.k13.C17.fq --trusted-cov 17

corr.k13.C17.sam: corr.k13.C17.fq
	cat corr.k13.C17.fq | bowtie2 -p 4 -x ecoli -U - -S corr.k13.C17.sam

corr.k13.C17.sam.pos: corr.k13.C17.sam
	./sam-scan.py ecoliMG1655.fa corr.k13.C17.sam -o corr.k13.C17.sam.pos

### k=15 correction pipeline
ecoli.dn.k15.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 15 -x 4e7 ecoli.dn.k15.kh ecoli-mapped.fq.gz.keep.gz

corr.k15.C15.fq: ecoli.dn.k15.kh ecoli-subset.fq
	../sandbox/error-correct-pass2.py ecoli.dn.k15.kh ecoli-subset.fq -o corr.k15.C15.fq --trusted-cov 15

corr.k15.C15.sam: corr.k15.C15.fq
	cat corr.k15.C15.fq | bowtie2 -p 4 -x ecoli -U - -S corr.k15.C15.sam

corr.k15.C15.sam.pos: corr.k15.C15.sam
	./sam-scan.py ecoliMG1655.fa corr.k15.C15.sam -o corr.k15.C15.sam.pos

### k=17 correction pipeline
ecoli.dn.k17.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 17 -x 4e7 ecoli.dn.k17.kh ecoli-mapped.fq.gz.keep.gz

corr.k17.C15.fq: ecoli.dn.k17.kh ecoli-subset.fq
	../sandbox/error-correct-pass2.py ecoli.dn.k17.kh ecoli-subset.fq -o corr.k17.C15.fq --trusted-cov 15

corr.k17.C15.sam: corr.k17.C15.fq
	cat corr.k17.C15.fq | bowtie2 -p 4 -x ecoli -U - -S corr.k17.C15.sam

corr.k17.C15.sam.pos: corr.k17.C15.sam
	./sam-scan.py ecoliMG1655.fa corr.k17.C15.sam -o corr.k17.C15.sam.pos

### larger-k counting tables + k=21 correction pipeline
ecoli.dn.k21.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 21 -x 8e7 ecoli.dn.k21.kh ecoli-mapped.fq.gz.keep.gz

ecoli.dn.k23.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 23 -x 8e7 ecoli.dn.k23.kh ecoli-mapped.fq.gz.keep.gz

ecoli.dn.k31.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 31 -x 8e7 ecoli.dn.k31.kh ecoli-mapped.fq.gz.keep.gz

corr.k21.C5.fq: ecoli.dn.k21.kh ecoli-subset.fq
	../sandbox/error-correct-pass2.py ecoli.dn.k21.kh ecoli-subset.fq -o corr.k21.C5.fq --trusted-cov 5

corr.k21.C5.sam: corr.k21.C5.fq
	cat corr.k21.C5.fq | bowtie2 -p 4 -x ecoli -U - -S corr.k21.C5.sam

corr.k21.C5.sam.pos: corr.k21.C5.sam
	./sam-scan.py ecoliMG1655.fa corr.k21.C5.sam -o corr.k21.C5.sam.pos

# "perfect" correction: counting table built from the reference itself
ecoli-ref.dn.k21.kh: ecoliMG1655.fa
	load-into-counting.py -k 21 -x 1e7 ecoli-ref.dn.k21.kh ecoliMG1655.fa

refcorr.k21.fq: ecoli-ref.dn.k21.kh
	../sandbox/error-correct-pass2.py ecoli-ref.dn.k21.kh ecoli-subset.fq -o refcorr.k21.fq --trusted-cov=1

refcorr.k21.sam: refcorr.k21.fq
	cat refcorr.k21.fq | bowtie2 -p 4 -x ecoli -U - -S refcorr.k21.sam

refcorr.k21.sam.pos: refcorr.k21.sam
	./sam-scan.py ecoliMG1655.fa refcorr.k21.sam -o refcorr.k21.sam.pos

### detailed per-base reports for still-erroneous reads
ref-errors.sam: ref-errors.fq
	bowtie2 -p 4 -x ecoli -U ref-errors.fq -S ref-errors.sam

ref-errors.details.txt: ref-errors.sam
	./sam-scan-details.py ecoliMG1655.fa ref-errors.sam ecoli-ref.dn.k21.kh -o ref-errors.details.txt -C 1

corr-errors.sam: corr-errors.fq
	bowtie2 -p 4 -x ecoli -U corr-errors.fq -S corr-errors.sam

corr-errors.details.txt: corr-errors.sam
	./sam-scan-details.py ecoliMG1655.fa corr-errors.sam ecoli.dn.k21.kh -o corr-errors.details.txt -C 5

### ad-hoc test set
test-set.sam: test-set.fq
	bowtie2 -p 4 -x ecoli -U test-set.fq -S test-set.sam

test-set.sam.pos: test-set.sam
	./sam-scan.py ecoliMG1655.fa test-set.sam -o test-set.sam.pos

test-set-report: test-set.sam.pos
	./summarize-pos-file.py test-set.sam.pos test-set.fq

### summary comparisons across all corrections
compare.txt: original.sam.pos corr.k13.C17.sam.pos corr.k15.C15.sam.pos \
	corr.k17.C15.sam.pos corr.k21.C5.sam.pos
	./summarize-pos-file.py original.sam.pos ecoli-subset.fq
	./summarize-pos-file.py corr.k13.C17.sam.pos corr.k13.C17.fq
	./summarize-pos-file.py corr.k15.C15.sam.pos corr.k15.C15.fq
	./summarize-pos-file.py corr.k17.C15.sam.pos corr.k17.C15.fq
	./summarize-pos-file.py corr.k21.C5.sam.pos corr.k21.C5.fq --save-erroneous-to=corr-errors.fq

refcompare.txt: refcorr.k21.sam.pos
	./summarize-pos-file.py original.sam.pos ecoli-subset.fq
	./summarize-pos-file.py refcorr.k21.sam.pos refcorr.k21.fq --save-erroneous-to=ref-errors.fq

# bowtie2 index + faidx for the reference
ecoli.1.bt2: ecoliMG1655.fa
	bowtie2-build ecoliMG1655.fa ecoli
	samtools faidx ecoliMG1655.fa
#####
# synthetic-data experiments built on nullgraph's read simulators
NULLGRAPH=~/dev/nullgraph

simple-haplo-reads.fa: simple-haplo.fa
	$(NULLGRAPH)/make-reads.py -C 100 -S 1 simple-haplo.fa > simple-haplo-reads.fa

simple-reads.fa: simple.fa
	$(NULLGRAPH)/make-biased-reads.py -S 1 -C 100 simple.fa > simple-reads.fa

simple-reads.ct: simple-reads.fa
	load-into-counting.py -k 20 -x 1.1e6 simple-reads.ct simple-reads.fa

# digital normalization also saves the counting table
simple-haplo-reads.dn.ct: simple-haplo-reads.fa
	normalize-by-median.py -k 20 -C 20 simple-haplo-reads.fa -s simple-haplo-reads.dn.ct -x 1.1e6

# example invocations of the variant-finding scripts
ex1: simple-haplo-reads.dn.ct
	find-variant-by-align.py simple-haplo-reads.dn.ct simple-alt.fa

ex2: simple-reads.ct
	count-by-align.py simple-reads.ct simple-haplo.fa > simple-haplo.counts
	count-by-align.py simple-reads.ct simple-alt.fa > simple-alt.counts

ex3:simple-haplo-reads.dn.ct
	find-variant-by-align.py simple-haplo-reads.dn.ct simple-long.fa

ex4:simple-haplo-reads.dn.ct
	find-variant-by-align-long.py simple-haplo-reads.dn.ct simple-haplo.fa

ex5:simple-haplo-reads.dn.ct
	find-variant-by-align-long.py simple-haplo-reads.dn.ct simple-long.fa
#! /usr/bin/env python
import sys
import argparse
import screed
import math
import khmer
def ignore_at(lines):
    """Yield items from `lines`, skipping SAM header lines (those
    starting with '@').

    The parameter was renamed from `iter`, which shadowed the builtin.
    """
    for item in lines:
        if item.startswith('@'):
            continue
        yield item
def main():
    """Scan a SAM file against the reference genome and, for every
    aligned read, print a per-base report: reference vs read bases,
    mismatch markers, and which ref/read k-mers fall below the
    trusted-count cutoff in the khmer counting table."""
    parser = argparse.ArgumentParser()
    parser.add_argument('genome')
    parser.add_argument('samfile')
    parser.add_argument('counts_table')
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'),
                        default=sys.stdout)
    parser.add_argument('-C', '--cutoff', type=int, default=2)
    args = parser.parse_args()
    kh = khmer.load_counting_hash(args.counts_table)
    # whole reference in memory, keyed by sequence name
    genome_dict = dict([ (record.name, record.sequence) for record in \
                         screed.open(args.genome) ])
    n = 0
    n_skipped = 0
    n_rev = n_fwd = 0
    for samline in ignore_at(open(args.samfile)):
        n += 1
        if n % 100000 == 0:
            print >>sys.stderr, '...', n
        # mandatory SAM columns; only name/flags/ref/pos/seq are used
        readname, flags, refname, refpos, _, _, _, _, _, seq = \
            samline.split('\t')[:10]
        if refname == '*' or refpos == '*':
            # unmapped read
            # (don't count these as skipped)
            continue
        refpos = int(refpos)            # SAM positions are 1-based
        try:
            ref = genome_dict[refname][refpos-1:refpos+len(seq) - 1]
        except KeyError:
            print >>sys.stderr, "unknown refname: %s; ignoring (read %s)" % (refname, readname)
            n_skipped += 1
            continue
        errors = []
        out1 = []                       # reference bases
        out2 = []                       # read bases
        out3 = []                       # 'x' where ref != read
        out4 = []                       # '|' where ref k-mer is below cutoff
        out5 = []                       # '<'/'0' where read k-mer is below cutoff
        for pos, (a, b) in enumerate(zip(ref, seq)):
            if a != b:
                # see http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#sam-output - '16' is the revcomp flag.
                if int(flags) & 16:
                    # NOTE(review): this flips `pos` into original-read
                    # coordinates, and the k-mer lookups below then use
                    # the flipped position -- confirm that is intended.
                    pos = len(seq) - pos - 1
                    n_rev += 1
                else:
                    n_fwd += 1
                errors.append(pos)
            out1.append(a)
            out2.append(b)
            if a != b:
                out3.append('x')
            else:
                out3.append(' ')
            kmer = ref[pos:pos+kh.ksize()]
            if len(kmer) == kh.ksize() and kh.get(kmer) < args.cutoff:
                out4.append('|')
            else:
                out4.append(' ')
            kmer = seq[pos:pos+kh.ksize()]
            if len(kmer) == kh.ksize() and kh.get(kmer) < args.cutoff:
                # '0' = k-mer absent entirely, '<' = present but untrusted
                if kh.get(kmer) == 0:
                    out5.append('0')
                else:
                    out5.append('<')
            else:
                out5.append(' ')
        # emit the report in 60-column blocks
        for start in range(0, len(out1), 60):
            if start == 0:
                print >>args.outfile, '%s positions %s:%s (len %d)' % \
                      (readname, start, min(start+60, len(out1)), len(out1))
            else:
                print >>args.outfile, '%s:%s (len %d)' % \
                      (start, min(start+60, len(out1)), len(out1))
            print >>args.outfile, ' ref:', "".join(out1[start:start+60])
            print >>args.outfile, ' read:', "".join(out2[start:start+60])
            print >>args.outfile, ' mismatch:', "".join(out3[start:start+60])
            print >>args.outfile, ' ref kmer:', "".join(out4[start:start+60])
            print >>args.outfile, 'read kmer:', "".join(out5[start:start+60])
            print >>args.outfile, ''
        print >>args.outfile, '-------'
if __name__ == '__main__':
    main()
#! /usr/bin/env python
import sys
import argparse
import screed
import math
def ignore_at(iter):
    """Generator: pass through only lines that are not SAM '@' header
    lines."""
    for entry in iter:
        if not entry.startswith('@'):
            yield entry
def main():
    """Scan a SAM file against a reference genome and write one .pos
    line per aligned read: "readname p1,p2,..." listing mismatch
    positions (flipped into original-read coordinates for
    reverse-complement alignments)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('genome')
    parser.add_argument('samfile')
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'),
                        default=sys.stdout)
    args = parser.parse_args()
    # whole reference in memory, keyed by sequence name
    genome_dict = dict([ (record.name, record.sequence) for record in \
                         screed.open(args.genome) ])
    n = 0
    n_skipped = 0
    n_rev = n_fwd = 0
    for samline in ignore_at(open(args.samfile)):
        n += 1
        if n % 100000 == 0:
            print >>sys.stderr, '...', n
        # mandatory SAM columns; only name/flags/ref/pos/seq are used
        readname, flags, refname, refpos, _, _, _, _, _, seq = \
            samline.split('\t')[:10]
        if refname == '*' or refpos == '*':
            # unmapped read
            # (don't count these as skipped)
            continue
        refpos = int(refpos)            # SAM positions are 1-based
        try:
            ref = genome_dict[refname][refpos-1:refpos+len(seq) - 1]
        except KeyError:
            print >>sys.stderr, "unknown refname: %s; ignoring (read %s)" % (refname, readname)
            n_skipped += 1
            continue
        errors = []
        for pos, (a, b) in enumerate(zip(ref, seq)):
            if a != b:
                # see http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#sam-output - '16' is the revcomp flag.
                if int(flags) & 16:
                    # flip into original-read coordinates
                    pos = len(seq) - pos - 1
                    n_rev += 1
                else:
                    n_fwd += 1
                errors.append(pos)
        print >>args.outfile, readname, ",".join(map(str, errors))
    # avoid log errors via pseudocount
    n_fwd += 1
    n_rev += 1
    print >>sys.stderr, 'logratio of fwd to rev: %.2f' % (math.log(n_fwd / float(n_rev), 2))
    # fail loudly if more than 1% of reads hit unknown reference names
    # (NOTE: Python-2-only raise syntax)
    if n_skipped / float(n) > .01:
        raise Exception, "Error: too many reads ignored! %d of %d" % \
              (n_skipped, n)
if __name__ == '__main__':
    main()
>genome2
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCTTGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
>genome1
CTGCCATTGCCCCAACATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCTTCATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
>genome3
GTCCTGGCGGTCCCCATTCA
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCT
GTAGGAAGTCACATGTGCTAAATATCAG
TGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
GCACAGTTTTTTATACGAAACCGTCTGCGTCAAGACGGGCCACATGGT
>genome 2
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCTTGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
>genome2 1.5
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCTTCATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
#! /usr/bin/env python
import argparse
import screed
def output_single(read):
    """Render `read` as FASTQ when a quality string ('accuracy') is
    present, FASTA otherwise; the quality must match the sequence
    length."""
    name, sequence = read.name, read.sequence
    if hasattr(read, 'accuracy'):
        accuracy = read.accuracy
        assert len(sequence) == len(accuracy), (sequence, accuracy)
        return "@%s\n%s\n+\n%s\n" % (name, sequence, accuracy)
    return ">%s\n%s\n" % (name, sequence)
# read in list of error positions per read
def read_pos_file(filename, ignore_set=None):
    """Parse a .pos file, yielding (read_name, [sorted error positions]).

    `ignore_set` (optional) collects reads flagged 'V' (variable
    coverage); those reads, and reads already in the set, yield [].

    Bug fix: the previous default of a shared, mutated `set()` leaked
    ignore state between calls; a None sentinel is used instead.
    """
    if ignore_set is None:
        ignore_set = set()
    for line in open(filename):
        line = line.strip()
        try:
            # maxsplit=1: the read name is everything before the first space
            read, posns = line.split(' ', 1)
        except ValueError:
            read = line
            posns = []

        if read in ignore_set:
            posns = []
        elif posns:
            if posns == 'V':            # '==', not 'is': value comparison
                ignore_set.add(read)    # ignore variable coverage foo
                posns = []
            else:
                posns = sorted(int(x) for x in posns.split(','))
        yield read, posns
def main():
    """Split reads from `readsfile` into .err1/.err2/.err3/.err4 output
    files according to how many error positions `posfile` lists for
    them (.err4 = four or more); error-free reads are not written."""
    parser = argparse.ArgumentParser()
    parser.add_argument('posfile')
    parser.add_argument('readsfile')
    args = parser.parse_args()

    posdict = dict(read_pos_file(args.posfile))

    # context managers so all four outputs are flushed/closed on exit
    with open(args.readsfile + '.err1', 'w') as out_1err, \
         open(args.readsfile + '.err2', 'w') as out_2err, \
         open(args.readsfile + '.err3', 'w') as out_3err, \
         open(args.readsfile + '.err4', 'w') as out_4err:
        # (unused enumerate counter from the original removed)
        for read in screed.open(args.readsfile):
            errors = posdict.get(read.name, [])
            if not errors:
                continue
            if len(errors) == 1:
                out_1err.write(output_single(read))
            elif len(errors) == 2:
                out_2err.write(output_single(read))
            elif len(errors) == 3:
                out_3err.write(output_single(read))
            elif len(errors) >= 4:
                out_4err.write(output_single(read))


if __name__ == '__main__':
    main()
#! /usr/bin/env python
import sys
import argparse
import screed
def output_single(read):
    """Serialize one read: FASTQ if it carries a quality string, else
    FASTA.  Quality length must equal sequence length."""
    has_qual = hasattr(read, 'accuracy')
    if not has_qual:
        return ">%s\n%s\n" % (read.name, read.sequence)
    qual = read.accuracy
    assert len(read.sequence) == len(qual), (read.sequence, qual)
    return "@%s\n%s\n+\n%s\n" % (read.name, read.sequence, qual)
def read_pos_file(filename):
    """Parse a .pos file, yielding (read_name, [error positions]).

    A line is "readname p1,p2,..."; a bare read name yields [].

    Fixes: maxsplit=1 (the original's maxsplit=2 would break unpacking
    on a line with two spaces), and the positions are materialized as a
    list — callers take len() of them, which fails on a Python 3 map
    iterator.
    """
    for line in open(filename):
        line = line.strip()
        try:
            read, posns = line.split(' ', 1)
            posns = [int(x) for x in posns.split(',')]
        except ValueError:
            read = line
            posns = []
        yield read, posns
def main():
    """Summarize a .pos file over a read set: count mutated reads and
    total mutations, report the overall per-bp error rate, and
    optionally save the erroneous reads to a separate file."""
    parser = argparse.ArgumentParser()
    parser.add_argument('posfile')
    parser.add_argument('reads')
    parser.add_argument('--limitreads', default=None)
    parser.add_argument('--save-erroneous-to', dest='save_err_to',
                        help="output erroneous reads to this file",
                        type=argparse.FileType('w'), default=None)
    args = parser.parse_args()
    print 'reading files...', args.posfile, args.reads
    posdict = dict(read_pos_file(args.posfile))
    # optionally restrict the summary to reads named in another pos file
    limitnames = None
    if args.limitreads:
        limitnames = set([ readname for readname, _ in \
                           read_pos_file(args.limitreads) ])
    # first pass over the reads: count reads and total bp
    all_reads = 0
    sum_bp = 0
    print 'reading sequences...'
    for n, record in enumerate(screed.open(args.reads)):
        if n % 100000 == 0:
            print >>sys.stderr, '...', n
        if args.limitreads and record.name not in limitnames:
            continue
        all_reads += 1
        sum_bp += len(record.sequence)
    print 'done!'
    n_reads = 0
    n = 0                               # reads with at least one error
    m = 0                               # total error positions
    skipped = 0
    for k, v in posdict.iteritems():
        if args.limitreads and k not in limitnames:
            skipped += 1
            continue
        n_reads += 1
        if not v:
            continue
        n += 1
        m += len(v)
    # second pass over the reads to save erroneous ones, if requested
    if args.save_err_to:
        for nn, record in enumerate(screed.open(args.reads)):
            if nn % 100000 == 0:
                print >>sys.stderr, '... x save', nn
            if posdict.get(record.name):
                args.save_err_to.write(output_single(record))
    print 'XXX', all_reads, n_reads
    print 'posfile %s: %d mutated reads of %d; %d mutations total' % \
          (args.posfile, n, n_reads, m)
    print 'skipped:', skipped
    print '%d bp total' % (sum_bp,)
    # NOTE(review): divides by sum_bp -- raises ZeroDivisionError on an
    # empty read set
    print 'overall error rate: %f%%' % (100. * m / float(sum_bp))
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment