Last active
August 29, 2015 14:13
-
-
Save ctb/789e905e1a758f1cff66 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.bt2
*.sam
*.pos
*.fq
*.kh
*.kh.info
*.fai
*~
*.fq.gz
*-reads.fa
*.ct
*.ct.info
*.keep
*.pyc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Use with khmer branch test/ec_params.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
""" | |
Compare two lists of error positions ('pos files'). | |
first file is assumed to be prediction, second file is assumed to be "truth." | |
""" | |
import argparse | |
import sys | |
import screed | |
def do_counting(first, second, ignore_set):
    """Compare two per-read error-position maps.

    first/second map read name -> sorted list of error positions
    (first is the prediction, second the "truth"); reads listed in
    ignore_set are skipped entirely.

    Returns (n, o, matches, mismatches, n_ignored):
      n          -- reads with errors in BOTH first and second
      o          -- reads with errors in second (ignored reads excluded)
      matches    -- reads whose error positions agree exactly
      mismatches -- reads erroneous in both, but positions differ
      n_ignored  -- reads skipped because they were in ignore_set
    """
    n = 0           # reads w/errors in both first and second
    o = 0           # reads w/errors in second
    matches = 0     # exact matches
    mismatches = 0
    n_ignored = 0   # intentionally ignored

    # .items() instead of .iteritems(): identical behavior under
    # Python 2, and keeps the function importable under Python 3.
    for k, va in first.items():
        if k in ignore_set:         # ignoreme
            n_ignored += 1
            continue

        vb = second.get(k)
        if not va and not vb:       # no errors on either side: TN
            continue

        # one-sided disagreements (FP/FN) are handled by the caller
        # via the incorrect_in_a/correct_in_b set intersections.
        if va and not vb:
            continue
        if vb and not va:
            continue

        n += 1
        if va == vb:                # exact match!
            matches += 1
        else:
            mismatches += 1         # no match :(

    for k, vb in second.items():
        if k in ignore_set:         # ignoreme
            continue
        if len(vb):
            o += 1

    return n, o, matches, mismatches, n_ignored
# read in list of error positions per read | |
def read_pos_file(filename, ignore_set):
    """Yield (read_name, positions) pairs from a .pos file.

    Each line is "<readname> <pos1,pos2,...>"; a line with no second
    field means the read has no errors.  A position field of 'V' marks
    a variable-coverage read: the read is added to ignore_set (mutated
    in place) and yielded with no positions.  Reads already in
    ignore_set are likewise yielded with no positions.
    """
    for line in open(filename):
        line = line.strip()
        try:
            read, posns = line.split(' ', 2)
        except ValueError:
            # no position field (or an unexpected third field): treat
            # the whole line as a read name with no errors.
            read = line
            posns = []

        if read in ignore_set:
            posns = []
        elif posns:
            # bugfix: '==' rather than 'is' -- identity comparison of a
            # freshly-split string only worked by accident of CPython's
            # single-character interning.
            if posns == 'V':        # ignore variable coverage foo
                ignore_set.add(read)
                posns = []
            else:
                posns = list(sorted(map(int, posns.split(','))))

        yield read, posns
def main(): | |
parser = argparse.ArgumentParser() | |
parser.add_argument('-V', '--variable', default=False, action='store_true') | |
parser.add_argument('-d', '--debug', default=False, action='store_true') | |
parser.add_argument('pos_a') | |
parser.add_argument('pos_b') | |
parser.add_argument('reads') | |
args = parser.parse_args() | |
ignore_set = set() # reads to ignore because of low coverage | |
# read in two lists of positions & errors | |
print >>sys.stderr, 'loading posfiles' | |
a = dict(read_pos_file(args.pos_a, ignore_set)) | |
b = dict(read_pos_file(args.pos_b, ignore_set)) | |
if not args.variable: # eliminate ignore_set | |
ignore_set = set() | |
# get list of all reads | |
print >>sys.stderr, 'loading reads from', args.reads | |
all_names = set() | |
total_reads = 0 | |
for record in screed.open(args.reads): | |
total_reads += 1 | |
if record.name not in ignore_set: | |
all_names.add(record.name) | |
print >>sys.stderr, 'done!' | |
# reads we pay attention to | |
tracked_reads = len(all_names) | |
n_ignored = total_reads - tracked_reads | |
n, o, matches, mismatches, n_ignoredXX = do_counting(a, b, ignore_set) | |
print 'total # of reads: %d' % (total_reads,) | |
print 'IGNORED due to -V: %d (%d, %.2f%%)' % (n_ignored, | |
tracked_reads, | |
tracked_reads / float(total_reads) * 100.) | |
print 'total # of tracked reads:', tracked_reads | |
print '%d erroneous reads in %s' % (n, args.pos_a) | |
print '%d erroneous reads in %s' % (o, args.pos_b) | |
print '%d reads in common => all error positions AGREE' % (matches,) | |
print '%d DISAGREE' % (mismatches,) | |
# assume a is prediction, b is correct | |
incorrect_in_a = set([ k for k in a if a[k] ]) | |
incorrect_in_b = set([ k for k in b if b[k] ]) | |
correct_in_a = all_names - set([ k for k in a if a[k] ]) | |
correct_in_b = all_names - set([ k for k in b if b[k] ]) | |
if args.debug: | |
print 'DDD incorrect in a:', len(incorrect_in_a) | |
print 'DDD incorrect in b:', len(incorrect_in_b) | |
print 'DDD correct in a:', len(correct_in_a) | |
print 'DDD correct in b:', len(correct_in_b) | |
# m is reads thought to be erroneous by a, agree exactly with b | |
tp = matches | |
# fp: reads thought to be erroneous by a, but actually correct (in b) | |
# + places where a called errors that were incorrect | |
fp = mismatches + len(incorrect_in_a.intersection(correct_in_b)) | |
if args.debug: | |
print 'DDD incorrect in a, correct in b:', \ | |
incorrect_in_a.intersection(correct_in_b) | |
# fn: reads through to be correct by a, but actually erroneous (in b) | |
fn = len(correct_in_a.intersection(incorrect_in_b)) | |
# tn: reads thought to be correct in both a and b | |
tn = len(correct_in_a.intersection(correct_in_b)) | |
print 'TP:', tp | |
print 'TN:', tn | |
print 'FP: %d (%d + %d)' % (fp, | |
mismatches, | |
len(incorrect_in_a.intersection(correct_in_b))) | |
print 'FN:', fn | |
print 'sensitivity:', tp / float(tp + fn) | |
print 'specificity:', tp / float(tn + fp) | |
assert n <= total_reads | |
assert o <= total_reads | |
assert n_ignored == n_ignoredXX | |
assert len(all_names) == tp + tn + fp + fn, \ | |
(len(all_names) - (tp+tn+fp+fn), | |
len(all_names), | |
tp, | |
tn, | |
fp, | |
fn) | |
if __name__ == '__main__': | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import sys | |
import argparse | |
import screed | |
import khmer | |
def main(): | |
parser = argparse.ArgumentParser() | |
parser.add_argument('table') | |
parser.add_argument('ref') | |
args = parser.parse_args() | |
ct = khmer.load_counting_hash(args.table) | |
aligner = khmer.new_readaligner(ct, 5, 1.0) | |
for record in screed.open(args.ref): | |
s = record.sequence | |
s = s.replace('N', 'A') | |
score, graph_alignment, read_alignment, truncated = \ | |
aligner.align(s) | |
assert not truncated | |
g = graph_alignment.replace('-', '') | |
r = read_alignment.replace('-', '') | |
print record.name | |
for kstart in range(0, len(g) - ct.ksize() + 1): | |
kmer = g[kstart:kstart+ct.ksize()] | |
print kstart, ct.get(kmer) | |
if __name__ == '__main__': | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/Users/t/Documents/papers/2014-streaming/pipeline/ecoliMG1655.fa |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import screed | |
import argparse | |
def output_single(read):
    """Render one screed record as FASTQ (when it carries an 'accuracy'
    quality string) or FASTA otherwise."""
    has_qual = hasattr(read, 'accuracy')
    if has_qual:
        fields = (read.name, read.sequence, read.accuracy)
        return "@%s\n%s\n+\n%s\n" % fields
    return ">%s\n%s\n" % (read.name, read.sequence)
def main():
    # Trim each read in 'original' down to the length of its
    # Quake-corrected counterpart and write it to 'output'.
    parser = argparse.ArgumentParser()
    parser.add_argument('original')
    parser.add_argument('quake')
    parser.add_argument('output')
    args = parser.parse_args()

    # corrected length of each read that survived Quake, keyed by name
    read_info = {}
    for record in screed.open(args.quake):
        read_info[record.name] = len(record.sequence)

    fp = open(args.output, 'w')
    for record in screed.open(args.original):
        if record.name in read_info:
            length = read_info[record.name]
            record.sequence = record.sequence[:length]
            record.accuracy = record.accuracy[:length]
            # NOTE(review): indentation reconstructed from a flattened
            # paste -- the write is assumed to apply only to reads Quake
            # kept (discarded reads are dropped); confirm against output.
            fp.write(output_single(record))

if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import sys | |
import argparse | |
import screed | |
import khmer | |
from graphAlignment import align_long, AlignmentIndex | |
def main():
    # Align each reference sequence against the counting graph with
    # align_long and print a per-record match summary plus the alignment
    # itself in 60-column slices.
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('ref')
    args = parser.parse_args()
    ct = khmer.load_counting_hash(args.table)
    aligner = khmer.new_readaligner(ct, 5, 1.0)
    for record in screed.open(args.ref):
        seq = record.sequence
        # presumably the aligner has no wildcard support -- TODO confirm
        seq = seq.replace('N', 'A')
        score, alignment = align_long(ct, aligner, seq)
        g = alignment.g
        r = alignment.r
        # m = matching columns, n = total columns
        m, n = alignment.compare()
        print record.name, m, n, n - m, "%.3f%%" % (float(m)/ n * 100)
        for start in range(0, len(alignment), 60):
            print alignment[start:start+60]
        if 0:  # debug scaffolding, disabled
            gidx = AlignmentIndex(alignment)
            for gi, a, b in alignment.mismatches():
                print gi, a, b, gidx.get_ri(gi)
        if 0:  # debug scaffolding, disabled
            print len(seq), alignment.refseqlen()
            gidx._sanityCheck(seq)

if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import sys | |
import argparse | |
import screed | |
import khmer | |
def main(): | |
parser = argparse.ArgumentParser() | |
parser.add_argument('table') | |
parser.add_argument('ref') | |
args = parser.parse_args() | |
ct = khmer.load_counting_hash(args.table) | |
aligner = khmer.new_readaligner(ct, 5, 1.0) | |
for record in screed.open(args.ref): | |
s = record.sequence | |
s = s.replace('N', 'A') | |
score, graph_alignment, read_alignment, truncated = \ | |
aligner.align(s) | |
#assert not truncated | |
g = graph_alignment #.replace('-', '') | |
r = read_alignment #.replace('-', '') | |
line1 = [] | |
line2 = [] | |
line3 = [] | |
for n, (a, b) in enumerate(zip(g, r)): | |
line1.append(a) | |
line3.append(b) | |
if a != b: | |
line2.append(' ') | |
else: | |
line2.append('|') | |
print '::', record.name, score, truncated | |
for start in range(0, len(line1), 60): | |
print "".join(line1[start:start+60]) | |
print "".join(line2[start:start+60]) | |
print "".join(line3[start:start+60]) | |
print '--' | |
#print record.name, ct.find_spectral_error_positions(s, 10) | |
if __name__ == '__main__': | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import sys | |
import argparse | |
import screed | |
import khmer | |
def main(): | |
parser = argparse.ArgumentParser() | |
parser.add_argument('table') | |
parser.add_argument('ref') | |
args = parser.parse_args() | |
ct = khmer.load_counting_hash(args.table) | |
for record in screed.open(args.ref): | |
s = record.sequence | |
s = s.replace('N', 'A') | |
for i in range(0, len(s) - ct.ksize() + 1): | |
kmer = s[i:i+ct.ksize()] | |
print i, ct.get(kmer) | |
print record.name, ct.find_spectral_error_positions(s, 10) | |
if __name__ == '__main__': | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import string | |
import array | |
import khmer | |
import screed | |
# complement table: bases complemented, gap chars ('-', '=') and N/n
# map to themselves so aligned rows keep their gap columns.
__complementTranslation = string.maketrans('ACTGactg-=Nn', 'TGACtgac-=Nn')

# chunk size used by align_long when picking seed k-mers
REGIONSIZE = 50

def reverse_complement(s):
    """
    Build reverse complement of 's', alignment-aware.

    Gap characters pass through unchanged (see table above), so an
    aligned row can be reverse-complemented in place.
    """
    # One slice-reverse + translate pass replaces the old
    # array -> string.join -> string.translate round trip; same result.
    return s[::-1].translate(__complementTranslation)
class GraphAlignment(object):
    """A pairwise alignment between a graph path (row g) and a
    read/reference sequence (row r).

    Both rows are stored uppercased and must have equal length;
    '-' and '=' are gap characters.
    """
    def __init__(self, g, r):
        assert len(g) == len(r), (g, len(g), r, len(r))
        self.g = g.upper()
        self.r = r.upper()

    def reverse_complement(self):
        """Return a new GraphAlignment with both rows reverse-complemented."""
        return GraphAlignment(reverse_complement(self.g),
                              reverse_complement(self.r))

    rc = reverse_complement     # short alias

    def __add__(self, other):
        """Concatenate two alignments column-wise."""
        return GraphAlignment(self.g + other.g, self.r + other.r)

    def __len__(self):
        return len(self.g)

    def refseqlen(self):
        """Length of the reference row with gap columns excluded."""
        r = self.r
        return r.count('G') + r.count('C') + r.count('T') + r.count('A')

    def __getitem__(self, i):
        if isinstance(i, slice):
            start, stop, step = i.indices(len(self.g))
            assert step == 1
            return GraphAlignment(self.g[start:stop], self.r[start:stop])
        return (self.g[i], self.r[i])

    def compare(self):
        """Return (matching_columns, total_columns).

        bugfix: the mismatch loop previously reused 'n' (the total
        length) as its loop variable, so the second value returned was
        the index of the last mismatch rather than the alignment length
        -- callers print it as the total.
        """
        total = len(self.g)
        m = total
        for _idx, _a, _b in self.mismatches():
            m -= 1
        return m, total

    def mismatches(self):
        """Yield (column, graph_char, ref_char) for each non-matching
        column; gap columns always count as mismatches."""
        for n, (a, b) in enumerate(zip(self.g, self.r)):
            if (a in '-=' or b in '-=' or a != b):
                yield n, a, b

    def __str__(self):
        line1 = []
        line2 = []
        line3 = []
        for (a, b) in zip(self.g, self.r):
            line1.append(a)
            line3.append(b)
            if a in '-=' or b in '-=' or a != b:
                line2.append(' ')
            else:
                line2.append('|')
        return "%s\n%s\n%s\n" % ("".join(line1), "".join(line2), "".join(line3))
def make_gap(r):
    """Return an alignment whose graph row is all '=' gap characters,
    placed opposite the unaligned sequence 'r'."""
    gap_row = '=' * len(r)
    return GraphAlignment(gap_row, r)
def _index_alignment(galign, freq=100): | |
g_to_r = {} | |
r_to_g = {} | |
si = 0 | |
for gi, (a, b) in enumerate(zip(galign.g, galign.r)): | |
if gi % freq == 0: | |
g_to_r[gi] = si | |
if si % freq == 0: | |
r_to_g[si] = gi | |
if b in 'ACGT': | |
si += 1 | |
return g_to_r, r_to_g | |
class AlignmentIndex(object):
    # Bidirectional coordinate translation between alignment columns
    # ("gi") and ungapped reference-sequence offsets ("ri"), using the
    # sparse checkpoints built by _index_alignment.
    def __init__(self, galign, freq=100):
        self.g_to_r, self.r_to_g = _index_alignment(galign, int(freq))
        self.galign = galign
        self.freq = int(freq)
    def get_gi(self, ri):
        "Return alignment coordinates corresponding to reference seq coords."
        # start at the checkpoint at/below ri, then walk forward one
        # reference base (non-gap column) at a time
        rpost = int(ri / self.freq) * self.freq
        gpost = self.r_to_g[rpost]
        diff = ri - rpost
        gi = gpost
        while diff > 0:
            (a, b) = self.galign[gi]
            if b in 'ACGT':
                diff -= 1
            gi += 1
        # make sure it's on a valid letter ;) -- skip any gap columns
        while 1:
            (a, b) = self.galign[gi]
            if b in 'ACGT':
                break
            gi += 1
        return gi
    def get_ri(self, gi):
        "Return reference sequence coordinates from alignment coordinates."
        # start at the checkpoint at/below gi, counting reference bases
        # as we walk the alignment columns up to gi
        gpost = int(gi / self.freq) * self.freq
        ri = self.g_to_r[gpost]
        diff = gi - gpost
        while diff > 0:
            (a, b) = self.galign[gpost]
            diff -= 1
            gpost += 1
            if b in 'ACGT':
                ri += 1
        return ri
    def _sanityCheck(self, seq):
        # verify the gi<->ri round trip against the raw reference 'seq'
        alignment = self.galign
        for gi in range(len(alignment)):
            if alignment[gi][1] in 'ACGT':
                ri = self.get_ri(gi)
                assert alignment[gi][1] == seq[ri]
        for ri in range(len(seq)):
            gi = self.get_gi(ri)
            assert seq[ri] == alignment[gi][1]
def stitch(galign, K):
    """Stitch together multiple alignments, assuming all overlaps are
    off-by-one on the right end.

    Every alignment in 'galign' except the last is trimmed of its final
    K-1 columns (the seed k-mer region shared with its right neighbor)
    before column-wise concatenation.
    """
    ga = []
    ra = []
    # (removed unused accumulators 'len_so_far' and 'n')
    for gg in galign[:-1]:
        g = gg.g
        r = gg.r
        assert len(g) == len(r)
        # drop the trailing K-1 overlap columns
        ga.append(g[:-K + 1])
        ra.append(r[:-K + 1])
    # the last segment is kept whole
    ga.append(galign[-1].g)
    ra.append(galign[-1].r)
    return GraphAlignment("".join(ga), "".join(ra))
def align_segment_right(aligner, seq, next_ch=None):
    # Align 'seq' left-to-right.  If the aligner stops early
    # (truncated), recover the unaligned tail: either align it
    # right-to-left anchored on the next region's first base
    # ('next_ch'), or -- with no anchor -- pad it out as an explicit
    # gap so downstream coordinate bookkeeping still works.
    score, g, r, truncated = aligner.align(seq)
    galign = GraphAlignment(g, r)

    # did it fail to align across entire segment?
    if truncated:
        aligned_length = galign.refseqlen()
        unaligned_len = len(seq) - aligned_length

        # if we are given next ch, try aligning backwards from next seed
        if next_ch:
            unaligned_seq = seq[-unaligned_len:]
            sr, ggr = align_segment_left(aligner, unaligned_seq + next_ch)
            galign += ggr[:-1]      # drop the borrowed next_ch column
            score += sr # need to adjust score down... @@
        else:
            # if not, just build a gap...
            galign += make_gap(seq[-unaligned_len:])

    # every input base must be accounted for, aligned or gapped
    assert galign.refseqlen() == len(seq)
    return score, galign
def align_segment_left(aligner, seq):
    """Align 'seq' right-to-left by aligning its reverse complement
    left-to-right and flipping the result back."""
    rc_score, rc_alignment = align_segment_right(aligner,
                                                 reverse_complement(seq))
    return rc_score, rc_alignment.rc()
def align_long(ct, aligner, sta):
    # Align a long sequence 'sta' piecewise: choose a high-abundance
    # seed k-mer in each REGIONSIZE chunk, align the seed-to-seed
    # regions (each anchored by the next region's seed), then stitch
    # the pieces back together.  Returns (total_score, GraphAlignment).
    K = ct.ksize()

    # first, pick seeds for each chunk
    seeds = []
    for start in range(0, len(sta), REGIONSIZE):
        # region overlaps the next chunk by K-1 so boundary k-mers count
        region = sta[start:start + REGIONSIZE + K - 1]
        if len(region) < K:
            assert len(sta) - start < REGIONSIZE
            break
        seed_pos = find_highest_abund_kmer(ct, region)
        seeds.append(start + seed_pos)

    assert len(seeds) == len(set(seeds))

    # then, break between-seed intervals down into regions, starting at
    # first seed.
    region_coords = []
    for i in range(len(seeds) - 1):
        seed_pos = seeds[i]
        end_seed = seeds[i + 1] - 1
        region_coords.append((seed_pos, end_seed + K))

    # account for situation where last region is too small to align.
    if len(sta) - seeds[-1] > K:
        region_coords.append((seeds[-1], len(sta)))
    else:
        # merge the too-short tail into the previous region
        (last_seed, _) = region_coords.pop()
        region_coords.append((last_seed, len(sta)))

    # start building piecewise alignments, anchored by seeds.
    alignments = []
    scores = []
    n = 0
    for (start, end) in region_coords[:-1]:
        # next_ch anchors truncation recovery in align_segment_right
        score, galign = align_segment_right(aligner, sta[start:end],
                                            next_ch=sta[end])
        scores.append(score)
        alignments.append(galign)
        n += 1

    # deal with end (no possibility of alignment from right)
    (start, end) = region_coords[-1]
    score, galign = align_segment_right(aligner, sta[start:end])
    alignments.append(galign)
    scores.append(score)

    # deal with beginning, too: reverse align from first seed.
    leftend = sta[0:seeds[0] + K]
    score, galign = align_segment_left(aligner, leftend)
    galign = galign[:-1]        # trim off seed k-mer
    alignments.insert(0, galign)
    scores.insert(0, score)

    # stitch all the alignments together
    final = stitch(alignments, K)
    return sum(scores), final
def find_highest_abund_kmer(ct, r):
    """Return the start position of the most abundant k-mer in 'r',
    according to counting table 'ct' (earliest position wins ties).

    bugfix: the running maximum 'abund' was never updated inside the
    loop, so the function returned the position of the *last* k-mer
    more abundant than the first one, not the overall maximum.
    """
    pos = 0
    abund = ct.get(r[:ct.ksize()])
    for kstart in range(1, len(r) - ct.ksize() + 1):
        count = ct.get(r[kstart:kstart + ct.ksize()])
        if count > abund:
            abund = count
            pos = kstart
    return pos
def test_1():
    # Smoke test: build a counting hash from local read data, run
    # align_long over a stitched-together haplotype sequence, and
    # sanity-check the coordinate index.  Requires the data file
    # 'simple-haplo-reads.fa.keep' in the working directory.
    ct = khmer.new_counting_hash(20, 1.1e6, 4)
    ct.consume_fasta('simple-haplo-reads.fa.keep')
    #ct = khmer.load_counting_hash('simple-haplo-reads.dn.ct')
    aligner = khmer.new_readaligner(ct, 5, 1.0)
    seq = "".join("""GTCCTGGCGGTCCCCATTCA
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCT
GTAGGAAGTCACATGTGCTAAATATCAG
TGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
GCACAGTTTTTTATACGAAACCGTCTGCGTCAAGACGGGCCACATGGT
""".strip().split())
    score, alignment = align_long(ct, aligner, seq)
    print len(seq), alignment.refseqlen()
    for start in range(0, len(alignment), 60):
        print alignment[start:start+60]
    gidx = AlignmentIndex(alignment)
    gidx._sanityCheck(seq)
def test_2():
    # Same smoke test but on a sequence loaded from 'simplefoo.fa'.
    # Requires 'simple-haplo-reads.fa.keep' and 'simplefoo.fa' on disk.
    ct = khmer.new_counting_hash(20, 1.1e6, 4)
    ct.consume_fasta('simple-haplo-reads.fa.keep')
    aligner = khmer.new_readaligner(ct, 5, 1.0)
    seq = "".join("""GTCCTGGCGGTCCCCATTCA
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCT
GTAGGAAGTCACATGTGCTAAATATCAG
TGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTAT
""".strip().split())
    # NOTE: the literal above is dead -- 'seq' is immediately replaced
    # by the first record of simplefoo.fa.
    seq = list(screed.open('simplefoo.fa'))[0].sequence
    score, alignment = align_long(ct, aligner, seq)
    print len(seq), alignment.refseqlen()
    for start in range(0, len(alignment), 60):
        print alignment[start:start+60]
    gidx = AlignmentIndex(alignment)
    gidx._sanityCheck(seq)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# evaluate against perfect?
QUAKE=/Users/t/Documents/papers/2014-streaming/Quake

# default: detailed per-read error reports for both correction pipelines
all: ref-errors.details.txt corr-errors.details.txt

display: compare.txt refcompare.txt

# 10k- and 100k-read random subsets of the mapped reads
ecoli-subset.fq: ecoli-mapped.fq.gz
	sample-reads-randomly.py -R 1 -N 10000 ecoli-mapped.fq.gz -o ecoli-subset.fq

ecoli-subset.100k.fq: ecoli-mapped.fq.gz
	sample-reads-randomly.py -R 2 -N 100000 ecoli-mapped.fq.gz -o ecoli-subset.100k.fq

# Quake error correction on the 100k subset (count/model steps were
# run once and are left commented out)
ecoli-subset.100k.cor.fq.gz: ecoli-subset.100k.fq
	echo ecoli-subset.100k.fq > ecoli_quake_list.txt
	#ln -fs /usr/local/bin/jellyfish .
	#cat ecoli-mapped.fq.gz.keep | $(QUAKE)/bin/count-qmers -q 33 -k 14 > ecoli_dn_counts.out
	#$(QUAKE)/bin/cov_model.py ecoli_dn_counts.out > ecoli_dn_counts.cov
	$(QUAKE)/bin/correct -f ecoli_quake_list.txt -p 4 -k 14 -q 33 -c 7.94 -z -m ecoli_dn_counts.out
	#mv ecoli-mapped.cor.fq.gz ecoli-mapped.dn.cor.fq.gz

ecoli-subset.100k.quake.fq: ecoli-subset.100k.cor.fq.gz ecoli-subset.100k.fq
	extract-original-reads-from-quake-cor.py ecoli-subset.100k.fq ecoli-subset.100k.cor.fq.gz ecoli-subset.100k.quake.fq

# map the uncorrected subset, then extract per-read error positions
original.sam: ecoli-subset.fq
	cat ecoli-subset.fq | bowtie2 -p 4 -x ecoli -U - -S original.sam

original.sam.pos: original.sam
	./sam-scan.py ecoliMG1655.fa original.sam -o original.sam.pos

### khmer correction pipeline, k=13 / trusted coverage 17
ecoli.dn.k13.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 13 -x 4e7 ecoli.dn.k13.kh ecoli-mapped.fq.gz.keep.gz

corr.k13.C17.fq: ecoli.dn.k13.kh ecoli-subset.fq
	../sandbox/error-correct-pass2.py ecoli.dn.k13.kh ecoli-subset.fq -o corr.k13.C17.fq --trusted-cov 17

corr.k13.C17.sam: corr.k13.C17.fq
	cat corr.k13.C17.fq | bowtie2 -p 4 -x ecoli -U - -S corr.k13.C17.sam

corr.k13.C17.sam.pos: corr.k13.C17.sam
	./sam-scan.py ecoliMG1655.fa corr.k13.C17.sam -o corr.k13.C17.sam.pos

### k=15 / C=15
ecoli.dn.k15.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 15 -x 4e7 ecoli.dn.k15.kh ecoli-mapped.fq.gz.keep.gz

corr.k15.C15.fq: ecoli.dn.k15.kh ecoli-subset.fq
	../sandbox/error-correct-pass2.py ecoli.dn.k15.kh ecoli-subset.fq -o corr.k15.C15.fq --trusted-cov 15

corr.k15.C15.sam: corr.k15.C15.fq
	cat corr.k15.C15.fq | bowtie2 -p 4 -x ecoli -U - -S corr.k15.C15.sam

corr.k15.C15.sam.pos: corr.k15.C15.sam
	./sam-scan.py ecoliMG1655.fa corr.k15.C15.sam -o corr.k15.C15.sam.pos

### k=17 / C=15
ecoli.dn.k17.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 17 -x 4e7 ecoli.dn.k17.kh ecoli-mapped.fq.gz.keep.gz

corr.k17.C15.fq: ecoli.dn.k17.kh ecoli-subset.fq
	../sandbox/error-correct-pass2.py ecoli.dn.k17.kh ecoli-subset.fq -o corr.k17.C15.fq --trusted-cov 15

corr.k17.C15.sam: corr.k17.C15.fq
	cat corr.k17.C15.fq | bowtie2 -p 4 -x ecoli -U - -S corr.k17.C15.sam

corr.k17.C15.sam.pos: corr.k17.C15.sam
	./sam-scan.py ecoliMG1655.fa corr.k17.C15.sam -o corr.k17.C15.sam.pos

### larger-k counting tables; k=21 / C=5 correction
ecoli.dn.k21.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 21 -x 8e7 ecoli.dn.k21.kh ecoli-mapped.fq.gz.keep.gz

ecoli.dn.k23.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 23 -x 8e7 ecoli.dn.k23.kh ecoli-mapped.fq.gz.keep.gz

ecoli.dn.k31.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 31 -x 8e7 ecoli.dn.k31.kh ecoli-mapped.fq.gz.keep.gz

corr.k21.C5.fq: ecoli.dn.k21.kh ecoli-subset.fq
	../sandbox/error-correct-pass2.py ecoli.dn.k21.kh ecoli-subset.fq -o corr.k21.C5.fq --trusted-cov 5

corr.k21.C5.sam: corr.k21.C5.fq
	cat corr.k21.C5.fq | bowtie2 -p 4 -x ecoli -U - -S corr.k21.C5.sam

corr.k21.C5.sam.pos: corr.k21.C5.sam
	./sam-scan.py ecoliMG1655.fa corr.k21.C5.sam -o corr.k21.C5.sam.pos

# "reference" pipeline: correct against a table built from the genome itself
ecoli-ref.dn.k21.kh: ecoliMG1655.fa
	load-into-counting.py -k 21 -x 1e7 ecoli-ref.dn.k21.kh ecoliMG1655.fa

refcorr.k21.fq: ecoli-ref.dn.k21.kh
	../sandbox/error-correct-pass2.py ecoli-ref.dn.k21.kh ecoli-subset.fq -o refcorr.k21.fq --trusted-cov=1

refcorr.k21.sam: refcorr.k21.fq
	cat refcorr.k21.fq | bowtie2 -p 4 -x ecoli -U - -S refcorr.k21.sam

refcorr.k21.sam.pos: refcorr.k21.sam
	./sam-scan.py ecoliMG1655.fa refcorr.k21.sam -o refcorr.k21.sam.pos

### detailed reports on still-erroneous reads
ref-errors.sam: ref-errors.fq
	bowtie2 -p 4 -x ecoli -U ref-errors.fq -S ref-errors.sam

ref-errors.details.txt: ref-errors.sam
	./sam-scan-details.py ecoliMG1655.fa ref-errors.sam ecoli-ref.dn.k21.kh -o ref-errors.details.txt -C 1

corr-errors.sam: corr-errors.fq
	bowtie2 -p 4 -x ecoli -U corr-errors.fq -S corr-errors.sam

corr-errors.details.txt: corr-errors.sam
	./sam-scan-details.py ecoliMG1655.fa corr-errors.sam ecoli.dn.k21.kh -o corr-errors.details.txt -C 5

###
test-set.sam: test-set.fq
	bowtie2 -p 4 -x ecoli -U test-set.fq -S test-set.sam

test-set.sam.pos: test-set.sam
	./sam-scan.py ecoliMG1655.fa test-set.sam -o test-set.sam.pos

test-set-report: test-set.sam.pos
	./summarize-pos-file.py test-set.sam.pos test-set.fq

### summary comparisons
# NOTE(review): these recipes never write compare.txt/refcompare.txt,
# so the targets are always considered out of date -- confirm intended.
compare.txt: original.sam.pos corr.k13.C17.sam.pos corr.k15.C15.sam.pos \
		corr.k17.C15.sam.pos corr.k21.C5.sam.pos
	./summarize-pos-file.py original.sam.pos ecoli-subset.fq
	./summarize-pos-file.py corr.k13.C17.sam.pos corr.k13.C17.fq
	./summarize-pos-file.py corr.k15.C15.sam.pos corr.k15.C15.fq
	./summarize-pos-file.py corr.k17.C15.sam.pos corr.k17.C15.fq
	./summarize-pos-file.py corr.k21.C5.sam.pos corr.k21.C5.fq --save-erroneous-to=corr-errors.fq

refcompare.txt: refcorr.k21.sam.pos
	./summarize-pos-file.py original.sam.pos ecoli-subset.fq
	./summarize-pos-file.py refcorr.k21.sam.pos refcorr.k21.fq --save-erroneous-to=ref-errors.fq

# bowtie2/samtools indexes for the reference genome
ecoli.1.bt2: ecoliMG1655.fa
	bowtie2-build ecoliMG1655.fa ecoli
	samtools faidx ecoliMG1655.fa

##### synthetic-data examples
NULLGRAPH=~/dev/nullgraph

simple-haplo-reads.fa: simple-haplo.fa
	$(NULLGRAPH)/make-reads.py -C 100 -S 1 simple-haplo.fa > simple-haplo-reads.fa

simple-reads.fa: simple.fa
	$(NULLGRAPH)/make-biased-reads.py -S 1 -C 100 simple.fa > simple-reads.fa

simple-reads.ct: simple-reads.fa
	load-into-counting.py -k 20 -x 1.1e6 simple-reads.ct simple-reads.fa

simple-haplo-reads.dn.ct: simple-haplo-reads.fa
	normalize-by-median.py -k 20 -C 20 simple-haplo-reads.fa -s simple-haplo-reads.dn.ct -x 1.1e6

ex1: simple-haplo-reads.dn.ct
	find-variant-by-align.py simple-haplo-reads.dn.ct simple-alt.fa

ex2: simple-reads.ct
	count-by-align.py simple-reads.ct simple-haplo.fa > simple-haplo.counts
	count-by-align.py simple-reads.ct simple-alt.fa > simple-alt.counts

ex3:simple-haplo-reads.dn.ct
	find-variant-by-align.py simple-haplo-reads.dn.ct simple-long.fa

ex4:simple-haplo-reads.dn.ct
	find-variant-by-align-long.py simple-haplo-reads.dn.ct simple-haplo.fa

ex5:simple-haplo-reads.dn.ct
	find-variant-by-align-long.py simple-haplo-reads.dn.ct simple-long.fa
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import sys | |
import argparse | |
import screed | |
import math | |
import khmer | |
def ignore_at(iter):
    """Yield lines from 'iter', skipping SAM header lines ('@'-prefixed).

    (Parameter name shadows the builtin 'iter'; kept for signature
    compatibility with existing callers.)
    """
    for item in iter:
        if not item.startswith('@'):
            yield item
def main():
    # For every aligned read in a SAM file, print a five-row per-base
    # display: reference bases, read bases, mismatch markers, and
    # whether the k-mer starting at each position is below the
    # abundance cutoff in reference/read space.
    parser = argparse.ArgumentParser()
    parser.add_argument('genome')
    parser.add_argument('samfile')
    parser.add_argument('counts_table')
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'),
                        default=sys.stdout)
    parser.add_argument('-C', '--cutoff', type=int, default=2)
    args = parser.parse_args()

    kh = khmer.load_counting_hash(args.counts_table)
    genome_dict = dict([ (record.name, record.sequence) for record in \
                         screed.open(args.genome) ])
    n = 0
    n_skipped = 0
    n_rev = n_fwd = 0
    for samline in ignore_at(open(args.samfile)):
        n += 1
        if n % 100000 == 0:
            print >>sys.stderr, '...', n
        readname, flags, refname, refpos, _, _, _, _, _, seq = \
            samline.split('\t')[:10]
        if refname == '*' or refpos == '*':
            # unmapped read; (don't count these as skipped)
            continue
        refpos = int(refpos)    # SAM positions are 1-based
        try:
            # matching genome window for this read
            ref = genome_dict[refname][refpos-1:refpos+len(seq) - 1]
        except KeyError:
            print >>sys.stderr, "unknown refname: %s; ignoring (read %s)" % (refname, readname)
            n_skipped += 1
            continue
        errors = []
        out1 = []   # reference bases
        out2 = []   # read bases
        out3 = []   # 'x' where ref and read disagree
        out4 = []   # '|' where the ref k-mer is below the cutoff
        out5 = []   # '<' (low) / '0' (absent) for the read k-mer
        for pos, (a, b) in enumerate(zip(ref, seq)):
            if a != b:
                # see http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#sam-output - '16' is the revcomp flag.
                if int(flags) & 16:
                    # report in the read's original orientation
                    pos = len(seq) - pos - 1
                    n_rev += 1
                else:
                    n_fwd += 1
                errors.append(pos)
            out1.append(a)
            out2.append(b)
            if a != b:
                out3.append('x')
            else:
                out3.append(' ')
            # NOTE(review): for reverse-strand mismatches 'pos' was
            # remapped above, so these k-mer lookups use the remapped
            # coordinate -- confirm this is intended.
            kmer = ref[pos:pos+kh.ksize()]
            if len(kmer) == kh.ksize() and kh.get(kmer) < args.cutoff:
                out4.append('|')
            else:
                out4.append(' ')
            kmer = seq[pos:pos+kh.ksize()]
            if len(kmer) == kh.ksize() and kh.get(kmer) < args.cutoff:
                if kh.get(kmer) == 0:
                    out5.append('0')
                else:
                    out5.append('<')
            else:
                out5.append(' ')
        # emit the five rows in 60-column slices
        for start in range(0, len(out1), 60):
            if start == 0:
                print >>args.outfile, '%s positions %s:%s (len %d)' % \
                      (readname, start, min(start+60, len(out1)), len(out1))
            else:
                print >>args.outfile, '%s:%s (len %d)' % \
                      (start, min(start+60, len(out1)), len(out1))
            print >>args.outfile, ' ref:', "".join(out1[start:start+60])
            print >>args.outfile, ' read:', "".join(out2[start:start+60])
            print >>args.outfile, ' mismatch:', "".join(out3[start:start+60])
            print >>args.outfile, ' ref kmer:', "".join(out4[start:start+60])
            print >>args.outfile, 'read kmer:', "".join(out5[start:start+60])
            print >>args.outfile, ''
        print >>args.outfile, '-------'

if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import sys | |
import argparse | |
import screed | |
import math | |
def ignore_at(iter):
    """Yield lines from 'iter', skipping SAM header lines ('@'-prefixed).

    (Parameter name shadows the builtin 'iter'; kept for signature
    compatibility with existing callers.)
    """
    for item in iter:
        if not item.startswith('@'):
            yield item
def main():
    """Extract per-read mismatch positions from a SAM alignment.

    For each aligned read, compares the read sequence against the reference
    slice it maps to and writes one line to the pos file:
    'READNAME pos1,pos2,...' — mismatch positions, flipped into original
    read orientation for reverse-complement alignments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('genome')
    parser.add_argument('samfile')
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'),
                        default=sys.stdout)
    args = parser.parse_args()

    # reference name -> full sequence, loaded entirely into memory
    genome_dict = dict([ (record.name, record.sequence) for record in \
                         screed.open(args.genome) ])

    n = 0            # SAM records processed
    n_skipped = 0    # records whose reference name is unknown
    n_rev = n_fwd = 0  # mismatches counted by alignment strand
    for samline in ignore_at(open(args.samfile)):
        n += 1
        if n % 100000 == 0:
            print >>sys.stderr, '...', n

        # first ten mandatory SAM columns; only these four are used
        readname, flags, refname, refpos, _, _, _, _, _, seq = \
            samline.split('\t')[:10]
        if refname == '*' or refpos == '*':
            # unaligned record; (don't count these as skipped)
            continue

        refpos = int(refpos)
        try:
            # SAM POS is 1-based; take the reference span under the read
            ref = genome_dict[refname][refpos-1:refpos+len(seq) - 1]
        except KeyError:
            print >>sys.stderr, "unknown refname: %s; ignoring (read %s)" % (refname, readname)
            n_skipped += 1
            continue

        errors = []
        for pos, (a, b) in enumerate(zip(ref, seq)):
            if a != b:
                # see http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#sam-output - '16' is the revcomp flag.
                if int(flags) & 16:
                    # flip back into original read orientation
                    pos = len(seq) - pos - 1
                    n_rev += 1
                else:
                    n_fwd += 1
                errors.append(pos)

        print >>args.outfile, readname, ",".join(map(str, errors))

    # avoid log errors via pseudocount
    n_fwd += 1
    n_rev += 1
    print >>sys.stderr, 'logratio of fwd to rev: %.2f' % (math.log(n_fwd / float(n_rev), 2))

    # sanity check: bail if more than 1% of records had unknown references.
    # NOTE(review): divides by n, so an empty SAM file raises
    # ZeroDivisionError here — confirm whether that case can occur.
    if n_skipped / float(n) > .01:
        raise Exception, "Error: too many reads ignored! %d of %d" % \
              (n_skipped, n)

if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>genome2 | |
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCTTGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>genome1 | |
CTGCCATTGCCCCAACATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCTTCATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>genome3 | |
GTCCTGGCGGTCCCCATTCA | |
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCT | |
GTAGGAAGTCACATGTGCTAAATATCAG | |
TGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA | |
GCACAGTTTTTTATACGAAACCGTCTGCGTCAAGACGGGCCACATGGT |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>genome 2 | |
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCTTGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA | |
>genome2 1.5 | |
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCTTCATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import argparse | |
import screed | |
def output_single(read):
    """Format a screed record as FASTQ if it carries quality, else FASTA."""
    seq = read.sequence
    if not hasattr(read, 'accuracy'):
        return ">%s\n%s\n" % (read.name, seq)
    qual = read.accuracy
    assert len(seq) == len(qual), (seq, qual)
    return "@%s\n%s\n+\n%s\n" % (read.name, seq, qual)
# read in list of error positions per read
def read_pos_file(filename, ignore_set=None):
    """Yield (read_name, positions) pairs from a pos file.

    Each line is 'READNAME p1,p2,...'; a bare read name means no errors.
    A position field of 'V' marks variable-coverage reads: they are added
    to ignore_set and reported with no positions.  Reads already in
    ignore_set are likewise reported with no positions.

    Fixes two defects in the original: ignore_set defaulted to a *mutable*
    default argument that the function mutates (shared across calls), and
    the 'V' check used 'is' (identity) instead of '==', so it never matched
    and int('V') raised ValueError instead.
    """
    if ignore_set is None:
        ignore_set = set()
    for line in open(filename):
        line = line.strip()
        try:
            read, posns = line.split(' ', 2)
        except ValueError:
            read = line
            posns = []

        if read in ignore_set:
            posns = []
        elif posns:
            if posns == 'V':            # ignore variable coverage foo
                ignore_set.add(read)
                posns = []
            else:
                posns = list(sorted(map(int, posns.split(','))))

        yield read, posns
def main():
    """Split reads into per-error-count files based on a pos file.

    Reads with 1, 2, 3, or >= 4 errors are written to
    READSFILE.err1 .. READSFILE.err4 respectively; error-free reads
    are dropped.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('posfile')
    parser.add_argument('readsfile')
    args = parser.parse_args()

    # read name -> list of error positions ([] means error-free)
    posdict = dict(read_pos_file(args.posfile))

    # one output file per error count; >= 4 errors all go to .err4.
    # Close the handles explicitly (the original leaked all four).
    outfiles = dict((i, open('%s.err%d' % (args.readsfile, i), 'w'))
                    for i in (1, 2, 3, 4))
    try:
        for read in screed.open(args.readsfile):
            errors = posdict.get(read.name, [])
            if errors:
                outfiles[min(len(errors), 4)].write(output_single(read))
    finally:
        for fp in outfiles.values():
            fp.close()

if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import sys | |
import argparse | |
import screed | |
def output_single(read):
    """Format a screed record as FASTQ if it carries quality, else FASTA."""
    seq = read.sequence
    if not hasattr(read, 'accuracy'):
        return ">%s\n%s\n" % (read.name, seq)
    qual = read.accuracy
    assert len(seq) == len(qual), (seq, qual)
    return "@%s\n%s\n+\n%s\n" % (read.name, seq, qual)
def read_pos_file(filename):
    """Yield (read_name, positions) pairs from a pos file.

    Each line is 'READNAME p1,p2,...'; a bare read name (or a line whose
    positions fail to parse as ints) yields an empty position list.

    Fix: the positions are materialized *inside* the try block.  The
    original used map(int, ...), which is lazy on Python 3 — a malformed
    position would escape the except clause, and the returned map object
    is always truthy, breaking the 'if not v' checks in main().
    """
    for line in open(filename):
        line = line.strip()
        try:
            read, posns = line.split(' ', 2)
            posns = [int(x) for x in posns.split(',')]
        except ValueError:
            read = line
            posns = []
        yield read, posns
def main():
    """Summarize a pos file: mutated reads, total mutations, error rate.

    Optionally restricts the tally to read names appearing in another pos
    file (--limitreads), and saves the erroneous reads themselves with
    --save-erroneous-to.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('posfile')
    parser.add_argument('reads')
    parser.add_argument('--limitreads', default=None)
    parser.add_argument('--save-erroneous-to', dest='save_err_to',
                        help="output erroneous reads to this file",
                        type=argparse.FileType('w'), default=None)
    args = parser.parse_args()

    print 'reading files...', args.posfile, args.reads
    # read name -> list of error positions ([] means error-free)
    posdict = dict(read_pos_file(args.posfile))

    # optional whitelist of read names to consider
    limitnames = None
    if args.limitreads:
        limitnames = set([ readname for readname, _ in \
                           read_pos_file(args.limitreads) ])

    all_reads = 0    # reads counted from the sequence file
    sum_bp = 0       # total bases across counted reads

    # first pass over the reads file: count reads and base pairs
    print 'reading sequences...'
    for n, record in enumerate(screed.open(args.reads)):
        if n % 100000 == 0:
            print >>sys.stderr, '...', n
        if args.limitreads and record.name not in limitnames:
            continue

        all_reads += 1
        sum_bp += len(record.sequence)
    print 'done!'

    n_reads = 0      # pos-file entries kept after limiting
    n = 0            # reads with at least one error
    m = 0            # total error positions across those reads
    skipped = 0      # pos-file entries excluded by --limitreads
    for k, v in posdict.iteritems():
        if args.limitreads and k not in limitnames:
            skipped += 1
            continue

        n_reads += 1
        if not v:
            continue

        n += 1
        m += len(v)

    # optional second pass over the reads file, saving erroneous reads only
    if args.save_err_to:
        for nn, record in enumerate(screed.open(args.reads)):
            if nn % 100000 == 0:
                print >>sys.stderr, '... x save', nn
            if posdict.get(record.name):
                args.save_err_to.write(output_single(record))

    print 'XXX', all_reads, n_reads
    print 'posfile %s: %d mutated reads of %d; %d mutations total' % \
          (args.posfile, n, n_reads, m)
    print 'skipped:', skipped
    print '%d bp total' % (sum_bp,)
    # NOTE(review): ZeroDivisionError if no reads were counted (sum_bp == 0)
    print 'overall error rate: %f%%' % (100. * m / float(sum_bp))

if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment