Skip to content

Instantly share code, notes, and snippets.

@ctb
Last active August 29, 2015 14:13
Show Gist options
  • Save ctb/789e905e1a758f1cff66 to your computer and use it in GitHub Desktop.
Save ctb/789e905e1a758f1cff66 to your computer and use it in GitHub Desktop.
*.bt2
*.sam
*.pos
*.fq
*.kh
*.kh.info
*.fai
*~
*.fq.gz
*-reads.fa
*.ct
*.ct.info
*.keep
*.pyc
Use with khmer branch test/ec_params.
#! /usr/bin/env python
"""
Compare two lists of error positions ('pos files').
first file is assumed to be prediction, second file is assumed to be "truth."
"""
import argparse
import sys
import screed
def do_counting(first, second, ignore_set):
    """Tally agreement between two read -> error-positions mappings.

    `first` is the prediction, `second` is the truth.  Returns a 5-tuple:
      n          -- reads with errors in both mappings
      o          -- reads with errors in `second`
      matches    -- of n, reads whose error positions agree exactly
      mismatches -- of n, reads whose error positions differ
      n_ignored  -- reads in `first` skipped because of `ignore_set`

    Reads erroneous in only one of the two mappings are deliberately NOT
    counted here; those one-sided cases are handled by the set
    intersections in main().
    """
    n = 0               # reads w/errors in first
    o = 0               # reads w/errors in second
    matches = 0         # exact matches
    mismatches = 0
    n_ignored = 0       # intentionally ignored

    # .items() instead of the Python-2-only .iteritems(); identical iteration
    for k, va in first.items():
        if k in ignore_set:
            n_ignored += 1
            continue

        vb = second.get(k)
        # no errors on either side (TN), or errors on only one side
        # (FP/FN, counted elsewhere via set intersections): skip.
        if not va or not vb:
            continue

        n += 1
        if va == vb:                    # exact match!
            matches += 1
        else:
            mismatches += 1             # no match :(

    for k, vb in second.items():
        if k in ignore_set:
            continue
        if vb:
            o += 1

    return n, o, matches, mismatches, n_ignored
# read in list of error positions per read
def read_pos_file(filename, ignore_set):
    """Parse a .pos file, yielding (read_name, [sorted error positions]).

    Each line is "readname p1,p2,..."; a bare read name means no errors.
    A positions field of 'V' marks a variable-coverage read: the read is
    added to `ignore_set` (mutated in place) and yielded with no
    positions.  Reads already in `ignore_set` also yield [].
    """
    with open(filename) as fp:          # close the file deterministically
        for line in fp:
            line = line.strip()
            try:
                # maxsplit=1: the name is everything before the first
                # space (the original used maxsplit=2, which would break
                # unpacking on a line with two spaces)
                read, posns = line.split(' ', 1)
            except ValueError:
                read = line
                posns = []

            if read in ignore_set:
                posns = []
            elif posns:
                if posns == 'V':        # '==', not 'is': identity compare
                    ignore_set.add(read)    # ignore variable coverage foo
                    posns = []
                else:
                    posns = sorted(int(x) for x in posns.split(','))
            yield read, posns
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-V', '--variable', default=False, action='store_true')
parser.add_argument('-d', '--debug', default=False, action='store_true')
parser.add_argument('pos_a')
parser.add_argument('pos_b')
parser.add_argument('reads')
args = parser.parse_args()
ignore_set = set() # reads to ignore because of low coverage
# read in two lists of positions & errors
print >>sys.stderr, 'loading posfiles'
a = dict(read_pos_file(args.pos_a, ignore_set))
b = dict(read_pos_file(args.pos_b, ignore_set))
if not args.variable: # eliminate ignore_set
ignore_set = set()
# get list of all reads
print >>sys.stderr, 'loading reads from', args.reads
all_names = set()
total_reads = 0
for record in screed.open(args.reads):
total_reads += 1
if record.name not in ignore_set:
all_names.add(record.name)
print >>sys.stderr, 'done!'
# reads we pay attention to
tracked_reads = len(all_names)
n_ignored = total_reads - tracked_reads
n, o, matches, mismatches, n_ignoredXX = do_counting(a, b, ignore_set)
print 'total # of reads: %d' % (total_reads,)
print 'IGNORED due to -V: %d (%d, %.2f%%)' % (n_ignored,
tracked_reads,
tracked_reads / float(total_reads) * 100.)
print 'total # of tracked reads:', tracked_reads
print '%d erroneous reads in %s' % (n, args.pos_a)
print '%d erroneous reads in %s' % (o, args.pos_b)
print '%d reads in common => all error positions AGREE' % (matches,)
print '%d DISAGREE' % (mismatches,)
# assume a is prediction, b is correct
incorrect_in_a = set([ k for k in a if a[k] ])
incorrect_in_b = set([ k for k in b if b[k] ])
correct_in_a = all_names - set([ k for k in a if a[k] ])
correct_in_b = all_names - set([ k for k in b if b[k] ])
if args.debug:
print 'DDD incorrect in a:', len(incorrect_in_a)
print 'DDD incorrect in b:', len(incorrect_in_b)
print 'DDD correct in a:', len(correct_in_a)
print 'DDD correct in b:', len(correct_in_b)
# m is reads thought to be erroneous by a, agree exactly with b
tp = matches
# fp: reads thought to be erroneous by a, but actually correct (in b)
# + places where a called errors that were incorrect
fp = mismatches + len(incorrect_in_a.intersection(correct_in_b))
if args.debug:
print 'DDD incorrect in a, correct in b:', \
incorrect_in_a.intersection(correct_in_b)
# fn: reads through to be correct by a, but actually erroneous (in b)
fn = len(correct_in_a.intersection(incorrect_in_b))
# tn: reads thought to be correct in both a and b
tn = len(correct_in_a.intersection(correct_in_b))
print 'TP:', tp
print 'TN:', tn
print 'FP: %d (%d + %d)' % (fp,
mismatches,
len(incorrect_in_a.intersection(correct_in_b)))
print 'FN:', fn
print 'sensitivity:', tp / float(tp + fn)
print 'specificity:', tp / float(tn + fp)
assert n <= total_reads
assert o <= total_reads
assert n_ignored == n_ignoredXX
assert len(all_names) == tp + tn + fp + fn, \
(len(all_names) - (tp+tn+fp+fn),
len(all_names),
tp,
tn,
fp,
fn)
if __name__ == '__main__':
main()
#! /usr/bin/env python
import sys
import argparse
import screed
import khmer
def main():
parser = argparse.ArgumentParser()
parser.add_argument('table')
parser.add_argument('ref')
args = parser.parse_args()
ct = khmer.load_counting_hash(args.table)
aligner = khmer.new_readaligner(ct, 5, 1.0)
for record in screed.open(args.ref):
s = record.sequence
s = s.replace('N', 'A')
score, graph_alignment, read_alignment, truncated = \
aligner.align(s)
assert not truncated
g = graph_alignment.replace('-', '')
r = read_alignment.replace('-', '')
print record.name
for kstart in range(0, len(g) - ct.ksize() + 1):
kmer = g[kstart:kstart+ct.ksize()]
print kstart, ct.get(kmer)
if __name__ == '__main__':
main()
/Users/t/Documents/papers/2014-streaming/pipeline/ecoliMG1655.fa
#! /usr/bin/env python
import screed
import argparse
def output_single(read):
    """Format a screed record as FASTQ when it carries quality scores
    ('accuracy'), FASTA otherwise."""
    if not hasattr(read, 'accuracy'):
        return ">%s\n%s\n" % (read.name, read.sequence)
    return "@%s\n%s\n+\n%s\n" % (read.name, read.sequence, read.accuracy)
def main():
    """For each read that survived Quake correction, emit the original
    read trimmed to the length Quake left it at, so the original and
    corrected files stay directly comparable."""
    parser = argparse.ArgumentParser()
    parser.add_argument('original')
    parser.add_argument('quake')
    parser.add_argument('output')
    args = parser.parse_args()

    # length of each read as Quake kept it
    read_info = {}
    for record in screed.open(args.quake):
        read_info[record.name] = len(record.sequence)

    # use a context manager so the output is flushed/closed on exit
    with open(args.output, 'w') as fp:
        for record in screed.open(args.original):
            if record.name in read_info:
                length = read_info[record.name]
                record.sequence = record.sequence[:length]
                # guard: FASTA input has no quality string
                if hasattr(record, 'accuracy'):
                    record.accuracy = record.accuracy[:length]
                fp.write(output_single(record))


if __name__ == '__main__':
    main()
#! /usr/bin/env python
import sys
import argparse
import screed
import khmer
from graphAlignment import align_long, AlignmentIndex
def main():
    """Run the long-read graph aligner (align_long) over each sequence
    in `ref` against counting table `table`, printing a per-record match
    summary and the alignment in 60-column blocks."""
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('ref')
    args = parser.parse_args()
    ct = khmer.load_counting_hash(args.table)
    aligner = khmer.new_readaligner(ct, 5, 1.0)
    for record in screed.open(args.ref):
        seq = record.sequence
        seq = seq.replace('N', 'A')     # aligner cannot handle N bases
        score, alignment = align_long(ct, aligner, seq)
        g = alignment.g                 # NOTE(review): g and r are unused here
        r = alignment.r
        # m = matching columns, n = total columns
        m, n = alignment.compare()
        print record.name, m, n, n - m, "%.3f%%" % (float(m)/ n * 100)
        for start in range(0, len(alignment), 60):
            print alignment[start:start+60]
        if 0:       # debug: map mismatch coordinates back to the reference
            gidx = AlignmentIndex(alignment)
            for gi, a, b in alignment.mismatches():
                print gi, a, b, gidx.get_ri(gi)
        if 0:       # debug: verify index round-trip
                    # NOTE(review): relies on gidx from the block above
            print len(seq), alignment.refseqlen()
            gidx._sanityCheck(seq)
if __name__ == '__main__':
    main()
#! /usr/bin/env python
import sys
import argparse
import screed
import khmer
def main():
parser = argparse.ArgumentParser()
parser.add_argument('table')
parser.add_argument('ref')
args = parser.parse_args()
ct = khmer.load_counting_hash(args.table)
aligner = khmer.new_readaligner(ct, 5, 1.0)
for record in screed.open(args.ref):
s = record.sequence
s = s.replace('N', 'A')
score, graph_alignment, read_alignment, truncated = \
aligner.align(s)
#assert not truncated
g = graph_alignment #.replace('-', '')
r = read_alignment #.replace('-', '')
line1 = []
line2 = []
line3 = []
for n, (a, b) in enumerate(zip(g, r)):
line1.append(a)
line3.append(b)
if a != b:
line2.append(' ')
else:
line2.append('|')
print '::', record.name, score, truncated
for start in range(0, len(line1), 60):
print "".join(line1[start:start+60])
print "".join(line2[start:start+60])
print "".join(line3[start:start+60])
print '--'
#print record.name, ct.find_spectral_error_positions(s, 10)
if __name__ == '__main__':
main()
#! /usr/bin/env python
import sys
import argparse
import screed
import khmer
def main():
parser = argparse.ArgumentParser()
parser.add_argument('table')
parser.add_argument('ref')
args = parser.parse_args()
ct = khmer.load_counting_hash(args.table)
for record in screed.open(args.ref):
s = record.sequence
s = s.replace('N', 'A')
for i in range(0, len(s) - ct.ksize() + 1):
kmer = s[i:i+ct.ksize()]
print i, ct.get(kmer)
print record.name, ct.find_spectral_error_positions(s, 10)
if __name__ == '__main__':
main()
import sys
import string
import array
import khmer
import screed
__complementTranslation = string.maketrans('ACTGactg-=Nn', 'TGACtgac-=Nn')
REGIONSIZE = 50
def reverse_complement(s):
    """
    Build reverse complement of 's', alignment-aware.

    Gap characters ('-', '=') and N map to themselves in the
    module-level translation table, so alignment columns survive.
    """
    # slice-reverse + translate replaces the original array('c')
    # reverse / string.join / string.translate dance; same result.
    return s[::-1].translate(__complementTranslation)
class GraphAlignment(object):
    """A pairwise alignment between a graph path ('g') and a read or
    reference sequence ('r').  Both strings have equal length; '-' and
    '=' are gap characters ('=' marks gap-filled, unaligned regions)."""

    def __init__(self, g, r):
        assert len(g) == len(r), (g, len(g), r, len(r))
        self.g = g.upper()              # graph-side sequence
        self.r = r.upper()              # read-side sequence

    def reverse_complement(self):
        """Return a new GraphAlignment with both sides reverse-complemented."""
        return GraphAlignment(reverse_complement(self.g),
                              reverse_complement(self.r))

    rc = reverse_complement             # short alias

    def __add__(self, other):
        return GraphAlignment(self.g + other.g, self.r + other.r)

    def __len__(self):
        return len(self.g)

    def refseqlen(self):
        """Number of actual bases (non-gap characters) on the read side."""
        r = self.r
        return r.count('G') + r.count('C') + r.count('T') + r.count('A')

    def __getitem__(self, i):
        if isinstance(i, slice):
            start, stop, step = i.indices(len(self.g))
            assert step == 1
            return GraphAlignment(self.g[start:stop], self.r[start:stop])
        return (self.g[i], self.r[i])

    def compare(self):
        """Return (n_matching_columns, n_total_columns).

        Bug fix: the original reused `n` as the mismatch-loop variable,
        clobbering the total-length return value with the index of the
        last mismatch.
        """
        total = len(self.g)
        matching = total
        for _idx, _a, _b in self.mismatches():
            matching -= 1
        return matching, total

    def mismatches(self):
        """Yield (column, graph_ch, read_ch) for each non-matching
        column; gap characters always count as mismatches."""
        for n, (a, b) in enumerate(zip(self.g, self.r)):
            if (a in '-=' or b in '-=' or a != b):
                yield n, a, b

    def __str__(self):
        # three-row rendering: graph / match bars / read
        line1 = []
        line2 = []
        line3 = []
        for (a, b) in zip(self.g, self.r):
            line1.append(a)
            line3.append(b)
            if a in '-=' or b in '-=' or a != b:
                line2.append(' ')
            else:
                line2.append('|')
        return "%s\n%s\n%s\n" % ("".join(line1), "".join(line2), "".join(line3))
def make_gap(r):
    """Return a GraphAlignment whose graph side is all '=' gap
    characters, covering read sequence `r`."""
    gap_side = '=' * len(r)
    return GraphAlignment(gap_side, r)
def _index_alignment(galign, freq=100):
g_to_r = {}
r_to_g = {}
si = 0
for gi, (a, b) in enumerate(zip(galign.g, galign.r)):
if gi % freq == 0:
g_to_r[gi] = si
if si % freq == 0:
r_to_g[si] = gi
if b in 'ACGT':
si += 1
return g_to_r, r_to_g
class AlignmentIndex(object):
    """Bidirectional coordinate index between alignment ('g') coordinates
    and reference-sequence ('r') coordinates of a GraphAlignment.

    Every `freq`-th coordinate is checkpointed in a dict by
    _index_alignment(); lookups jump to the nearest checkpoint at or
    below the query and walk forward column by column.
    """
    def __init__(self, galign, freq=100):
        self.g_to_r, self.r_to_g = _index_alignment(galign, int(freq))
        self.galign = galign
        self.freq = int(freq)
    def get_gi(self, ri):
        "Return alignment coordinate corresponding to reference seq coord."
        # jump to the checkpoint at or below ri ...
        rpost = int(ri / self.freq) * self.freq
        gpost = self.r_to_g[rpost]
        diff = ri - rpost
        gi = gpost
        # ... then walk forward, counting only real bases on the read side
        while diff > 0:
            (a, b) = self.galign[gi]
            if b in 'ACGT':
                diff -= 1
            gi += 1
        # make sure it's on a valid letter ;)
        while 1:
            (a, b) = self.galign[gi]
            if b in 'ACGT':
                break
            gi += 1
        return gi
    def get_ri(self, gi):
        "Return reference sequence coordinates from alignment coordinates."
        # jump to the checkpoint at or below gi ...
        gpost = int(gi / self.freq) * self.freq
        ri = self.g_to_r[gpost]
        diff = gi - gpost
        # ... then walk forward, advancing ri only on real bases
        while diff > 0:
            (a, b) = self.galign[gpost]
            diff -= 1
            gpost += 1
            if b in 'ACGT':
                ri += 1
        return ri
    def _sanityCheck(self, seq):
        # verify both coordinate mappings round-trip against `seq`
        alignment = self.galign
        for gi in range(len(alignment)):
            if alignment[gi][1] in 'ACGT':
                ri = self.get_ri(gi)
                assert alignment[gi][1] == seq[ri]
        for ri in range(len(seq)):
            gi = self.get_gi(ri)
            assert seq[ri] == alignment[gi][1]
def stitch(galign, K):
    """Stitch together multiple alignments, assuming all overlaps are
    off-by-one on the right end.

    Each alignment except the last contributes everything but its final
    K-1 columns (the seed-k-mer overlap with the next segment); the last
    segment is kept whole.  Returns one combined GraphAlignment.
    """
    ga = []
    ra = []
    # all but the last: trim the K-1 column overlap on the right
    # (unused `len_so_far` / `n` counters from the original removed)
    for gg in galign[:-1]:
        g = gg.g
        r = gg.r
        assert len(g) == len(r)
        ga.append(g[:-K + 1])
        ra.append(r[:-K + 1])

    # last segment is appended untrimmed
    ga.append(galign[-1].g)
    ra.append(galign[-1].r)

    return GraphAlignment("".join(ga), "".join(ra))
def align_segment_right(aligner, seq, next_ch=None):
    """Align `seq` against the graph left-to-right.

    If the aligner truncates (stops partway through `seq`), either retry
    the unaligned tail right-to-left anchored on `next_ch` (the first
    character of the following region), or pad it with an explicit gap,
    so the read side always covers all of `seq`.
    Returns (score, GraphAlignment).
    """
    score, g, r, truncated = aligner.align(seq)
    galign = GraphAlignment(g, r)

    # did it fail to align across entire segment?
    if truncated:
        aligned_length = galign.refseqlen()
        unaligned_len = len(seq) - aligned_length

        # if we are given next ch, try aligning backwards from next seed
        if next_ch:
            unaligned_seq = seq[-unaligned_len:]
            sr, ggr = align_segment_left(aligner, unaligned_seq + next_ch)
            # drop the extra next_ch column before appending
            galign += ggr[:-1]
            score += sr              # need to adjust score down... @@
        else:
            # if not, just build a gap...
            galign += make_gap(seq[-unaligned_len:])

    # invariant: the read side of the result covers seq completely
    assert galign.refseqlen() == len(seq)
    return score, galign
def align_segment_left(aligner, seq):
    """Align `seq` right-to-left: align its reverse complement
    left-to-right, then flip the resulting alignment back."""
    rc_seq = reverse_complement(seq)
    rc_score, rc_align = align_segment_right(aligner, rc_seq)
    return rc_score, rc_align.rc()
def align_long(ct, aligner, sta):
    """Align a long sequence `sta` piecewise against the graph.

    The sequence is chopped into REGIONSIZE chunks; the highest-abundance
    k-mer in each chunk becomes a seed.  Seed-to-seed regions are aligned
    left-to-right (with right-to-left fallback across gaps), the prefix
    before the first seed is aligned right-to-left, and the pieces are
    stitched back together.  Returns (total_score, GraphAlignment).
    """
    K = ct.ksize()
    # first, pick seeds for each chunk
    seeds = []
    for start in range(0, len(sta), REGIONSIZE):
        region = sta[start:start + REGIONSIZE + K - 1]
        if len(region) < K:
            # tail shorter than one k-mer: cannot seed it
            assert len(sta) - start < REGIONSIZE
            break
        seed_pos = find_highest_abund_kmer(ct, region)
        seeds.append(start + seed_pos)
    assert len(seeds) == len(set(seeds))
    # then, break between-seed intervals down into regions, starting at
    # first seed.
    region_coords = []
    for i in range(len(seeds) - 1):
        seed_pos = seeds[i]
        end_seed = seeds[i + 1] - 1
        region_coords.append((seed_pos, end_seed + K))
    # account for situation where last region is too small to align.
    if len(sta) - seeds[-1] > K:
        region_coords.append((seeds[-1], len(sta)))
    else:
        # merge the tiny tail into the previous region instead
        (last_seed, _) = region_coords.pop()
        region_coords.append((last_seed, len(sta)))
    # start building piecewise alignments, anchored by seeds.
    alignments = []
    scores = []
    n = 0
    for (start, end) in region_coords[:-1]:
        # next_ch lets a truncated alignment retry from the next seed
        score, galign = align_segment_right(aligner, sta[start:end],
                                            next_ch=sta[end])
        scores.append(score)
        alignments.append(galign)
        n += 1
    # deal with end (no possibility of alignment from right)
    (start, end) = region_coords[-1]
    score, galign = align_segment_right(aligner, sta[start:end])
    alignments.append(galign)
    scores.append(score)
    # deal with beginning, too: reverse align from first seed.
    leftend = sta[0:seeds[0] + K]
    score, galign = align_segment_left(aligner, leftend)
    galign = galign[:-1]                # trim off seed k-mer
    alignments.insert(0, galign)
    scores.insert(0, score)
    # stitch all the alignments together
    final = stitch(alignments, K)
    return sum(scores), final
def find_highest_abund_kmer(ct, r):
    """Return the start position of the most abundant k-mer in `r`,
    according to counting table `ct`.

    Bug fix: the running maximum (`abund`) was never updated, so the
    function previously returned the *last* position whose count beat
    the first k-mer's count rather than the true argmax.
    """
    pos = 0
    abund = ct.get(r[:ct.ksize()])
    for kstart in range(1, len(r) - ct.ksize() + 1):
        kmer = r[kstart:kstart + ct.ksize()]
        count = ct.get(kmer)
        if count > abund:
            abund = count       # track the new maximum
            pos = kstart
    return pos
def test_1():
    """Integration test: build a counting table from local read data,
    align a concatenated multi-region sequence with align_long, and
    sanity-check the coordinate index.  Requires khmer and the
    'simple-haplo-reads.fa.keep' file in the working directory."""
    ct = khmer.new_counting_hash(20, 1.1e6, 4)
    ct.consume_fasta('simple-haplo-reads.fa.keep')
    #ct = khmer.load_counting_hash('simple-haplo-reads.dn.ct')
    aligner = khmer.new_readaligner(ct, 5, 1.0)
    seq = "".join("""GTCCTGGCGGTCCCCATTCA
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCT
GTAGGAAGTCACATGTGCTAAATATCAG
TGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
GCACAGTTTTTTATACGAAACCGTCTGCGTCAAGACGGGCCACATGGT
""".strip().split())
    score, alignment = align_long(ct, aligner, seq)
    print len(seq), alignment.refseqlen()
    for start in range(0, len(alignment), 60):
        print alignment[start:start+60]
    # round-trip the coordinate index against the input sequence
    gidx = AlignmentIndex(alignment)
    gidx._sanityCheck(seq)
def test_2():
    """Integration test: like test_1, but loads the query sequence from
    'simplefoo.fa'.  NOTE(review): the inline literal below is built and
    then immediately overwritten -- likely leftover from test_1."""
    ct = khmer.new_counting_hash(20, 1.1e6, 4)
    ct.consume_fasta('simple-haplo-reads.fa.keep')
    aligner = khmer.new_readaligner(ct, 5, 1.0)
    seq = "".join("""GTCCTGGCGGTCCCCATTCA
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCT
GTAGGAAGTCACATGTGCTAAATATCAG
TGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTAT
""".strip().split())
    seq = list(screed.open('simplefoo.fa'))[0].sequence
    score, alignment = align_long(ct, aligner, seq)
    print len(seq), alignment.refseqlen()
    for start in range(0, len(alignment), 60):
        print alignment[start:start+60]
    # round-trip the coordinate index against the input sequence
    gidx = AlignmentIndex(alignment)
    gidx._sanityCheck(seq)
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# evaluate against perfect?
QUAKE=/Users/t/Documents/papers/2014-streaming/Quake

# default target: detailed error reports for reference- and self-corrected sets
all: ref-errors.details.txt corr-errors.details.txt

display: compare.txt refcompare.txt

# 10k-read subsample used for most comparisons
ecoli-subset.fq: ecoli-mapped.fq.gz
	sample-reads-randomly.py -R 1 -N 10000 ecoli-mapped.fq.gz -o ecoli-subset.fq

# 100k-read subsample for Quake correction
ecoli-subset.100k.fq: ecoli-mapped.fq.gz
	sample-reads-randomly.py -R 2 -N 100000 ecoli-mapped.fq.gz -o ecoli-subset.100k.fq

# run Quake (k=14, quality offset 33) on the 100k subsample
ecoli-subset.100k.cor.fq.gz: ecoli-subset.100k.fq
	echo ecoli-subset.100k.fq > ecoli_quake_list.txt
	#ln -fs /usr/local/bin/jellyfish .
	#cat ecoli-mapped.fq.gz.keep | $(QUAKE)/bin/count-qmers -q 33 -k 14 > ecoli_dn_counts.out
	#$(QUAKE)/bin/cov_model.py ecoli_dn_counts.out > ecoli_dn_counts.cov
	$(QUAKE)/bin/correct -f ecoli_quake_list.txt -p 4 -k 14 -q 33 -c 7.94 -z -m ecoli_dn_counts.out
	#mv ecoli-mapped.cor.fq.gz ecoli-mapped.dn.cor.fq.gz

# trim originals to Quake's read lengths so the sets stay comparable
ecoli-subset.100k.quake.fq: ecoli-subset.100k.cor.fq.gz ecoli-subset.100k.fq
	extract-original-reads-from-quake-cor.py ecoli-subset.100k.fq ecoli-subset.100k.cor.fq.gz ecoli-subset.100k.quake.fq

# map the uncorrected subsample and extract error positions
original.sam: ecoli-subset.fq
	cat ecoli-subset.fq | bowtie2 -p 4 -x ecoli -U - -S original.sam

original.sam.pos: original.sam
	./sam-scan.py ecoliMG1655.fa original.sam -o original.sam.pos

### k=13 correction pipeline
ecoli.dn.k13.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 13 -x 4e7 ecoli.dn.k13.kh ecoli-mapped.fq.gz.keep.gz

corr.k13.C17.fq: ecoli.dn.k13.kh ecoli-subset.fq
	../sandbox/error-correct-pass2.py ecoli.dn.k13.kh ecoli-subset.fq -o corr.k13.C17.fq --trusted-cov 17

corr.k13.C17.sam: corr.k13.C17.fq
	cat corr.k13.C17.fq | bowtie2 -p 4 -x ecoli -U - -S corr.k13.C17.sam

corr.k13.C17.sam.pos: corr.k13.C17.sam
	./sam-scan.py ecoliMG1655.fa corr.k13.C17.sam -o corr.k13.C17.sam.pos

### k=15 correction pipeline
ecoli.dn.k15.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 15 -x 4e7 ecoli.dn.k15.kh ecoli-mapped.fq.gz.keep.gz

corr.k15.C15.fq: ecoli.dn.k15.kh ecoli-subset.fq
	../sandbox/error-correct-pass2.py ecoli.dn.k15.kh ecoli-subset.fq -o corr.k15.C15.fq --trusted-cov 15

corr.k15.C15.sam: corr.k15.C15.fq
	cat corr.k15.C15.fq | bowtie2 -p 4 -x ecoli -U - -S corr.k15.C15.sam

corr.k15.C15.sam.pos: corr.k15.C15.sam
	./sam-scan.py ecoliMG1655.fa corr.k15.C15.sam -o corr.k15.C15.sam.pos

### k=17 correction pipeline
ecoli.dn.k17.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 17 -x 4e7 ecoli.dn.k17.kh ecoli-mapped.fq.gz.keep.gz

corr.k17.C15.fq: ecoli.dn.k17.kh ecoli-subset.fq
	../sandbox/error-correct-pass2.py ecoli.dn.k17.kh ecoli-subset.fq -o corr.k17.C15.fq --trusted-cov 15

corr.k17.C15.sam: corr.k17.C15.fq
	cat corr.k17.C15.fq | bowtie2 -p 4 -x ecoli -U - -S corr.k17.C15.sam

corr.k17.C15.sam.pos: corr.k17.C15.sam
	./sam-scan.py ecoliMG1655.fa corr.k17.C15.sam -o corr.k17.C15.sam.pos

### larger-k counting tables + k=21 correction pipeline
ecoli.dn.k21.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 21 -x 8e7 ecoli.dn.k21.kh ecoli-mapped.fq.gz.keep.gz

ecoli.dn.k23.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 23 -x 8e7 ecoli.dn.k23.kh ecoli-mapped.fq.gz.keep.gz

ecoli.dn.k31.kh: ecoli-mapped.fq.gz.keep.gz
	load-into-counting.py -k 31 -x 8e7 ecoli.dn.k31.kh ecoli-mapped.fq.gz.keep.gz

corr.k21.C5.fq: ecoli.dn.k21.kh ecoli-subset.fq
	../sandbox/error-correct-pass2.py ecoli.dn.k21.kh ecoli-subset.fq -o corr.k21.C5.fq --trusted-cov 5

corr.k21.C5.sam: corr.k21.C5.fq
	cat corr.k21.C5.fq | bowtie2 -p 4 -x ecoli -U - -S corr.k21.C5.sam

corr.k21.C5.sam.pos: corr.k21.C5.sam
	./sam-scan.py ecoliMG1655.fa corr.k21.C5.sam -o corr.k21.C5.sam.pos

# "perfect" correction: counting table built from the reference itself
ecoli-ref.dn.k21.kh: ecoliMG1655.fa
	load-into-counting.py -k 21 -x 1e7 ecoli-ref.dn.k21.kh ecoliMG1655.fa

refcorr.k21.fq: ecoli-ref.dn.k21.kh
	../sandbox/error-correct-pass2.py ecoli-ref.dn.k21.kh ecoli-subset.fq -o refcorr.k21.fq --trusted-cov=1

refcorr.k21.sam: refcorr.k21.fq
	cat refcorr.k21.fq | bowtie2 -p 4 -x ecoli -U - -S refcorr.k21.sam

refcorr.k21.sam.pos: refcorr.k21.sam
	./sam-scan.py ecoliMG1655.fa refcorr.k21.sam -o refcorr.k21.sam.pos

### detailed per-base reports for still-erroneous reads
ref-errors.sam: ref-errors.fq
	bowtie2 -p 4 -x ecoli -U ref-errors.fq -S ref-errors.sam

ref-errors.details.txt: ref-errors.sam
	./sam-scan-details.py ecoliMG1655.fa ref-errors.sam ecoli-ref.dn.k21.kh -o ref-errors.details.txt -C 1

corr-errors.sam: corr-errors.fq
	bowtie2 -p 4 -x ecoli -U corr-errors.fq -S corr-errors.sam

corr-errors.details.txt: corr-errors.sam
	./sam-scan-details.py ecoliMG1655.fa corr-errors.sam ecoli.dn.k21.kh -o corr-errors.details.txt -C 5

### ad-hoc test set
test-set.sam: test-set.fq
	bowtie2 -p 4 -x ecoli -U test-set.fq -S test-set.sam

test-set.sam.pos: test-set.sam
	./sam-scan.py ecoliMG1655.fa test-set.sam -o test-set.sam.pos

test-set-report: test-set.sam.pos
	./summarize-pos-file.py test-set.sam.pos test-set.fq

### summary comparisons across all corrections
compare.txt: original.sam.pos corr.k13.C17.sam.pos corr.k15.C15.sam.pos \
	corr.k17.C15.sam.pos corr.k21.C5.sam.pos
	./summarize-pos-file.py original.sam.pos ecoli-subset.fq
	./summarize-pos-file.py corr.k13.C17.sam.pos corr.k13.C17.fq
	./summarize-pos-file.py corr.k15.C15.sam.pos corr.k15.C15.fq
	./summarize-pos-file.py corr.k17.C15.sam.pos corr.k17.C15.fq
	./summarize-pos-file.py corr.k21.C5.sam.pos corr.k21.C5.fq --save-erroneous-to=corr-errors.fq

refcompare.txt: refcorr.k21.sam.pos
	./summarize-pos-file.py original.sam.pos ecoli-subset.fq
	./summarize-pos-file.py refcorr.k21.sam.pos refcorr.k21.fq --save-erroneous-to=ref-errors.fq

# bowtie2 index + faidx for the reference
ecoli.1.bt2: ecoliMG1655.fa
	bowtie2-build ecoliMG1655.fa ecoli
	samtools faidx ecoliMG1655.fa
#####
# synthetic-data experiments built on nullgraph's read simulators
NULLGRAPH=~/dev/nullgraph

simple-haplo-reads.fa: simple-haplo.fa
	$(NULLGRAPH)/make-reads.py -C 100 -S 1 simple-haplo.fa > simple-haplo-reads.fa

simple-reads.fa: simple.fa
	$(NULLGRAPH)/make-biased-reads.py -S 1 -C 100 simple.fa > simple-reads.fa

simple-reads.ct: simple-reads.fa
	load-into-counting.py -k 20 -x 1.1e6 simple-reads.ct simple-reads.fa

# digital normalization also saves the counting table
simple-haplo-reads.dn.ct: simple-haplo-reads.fa
	normalize-by-median.py -k 20 -C 20 simple-haplo-reads.fa -s simple-haplo-reads.dn.ct -x 1.1e6

# example invocations of the variant-finding scripts
ex1: simple-haplo-reads.dn.ct
	find-variant-by-align.py simple-haplo-reads.dn.ct simple-alt.fa

ex2: simple-reads.ct
	count-by-align.py simple-reads.ct simple-haplo.fa > simple-haplo.counts
	count-by-align.py simple-reads.ct simple-alt.fa > simple-alt.counts

ex3:simple-haplo-reads.dn.ct
	find-variant-by-align.py simple-haplo-reads.dn.ct simple-long.fa

ex4:simple-haplo-reads.dn.ct
	find-variant-by-align-long.py simple-haplo-reads.dn.ct simple-haplo.fa

ex5:simple-haplo-reads.dn.ct
	find-variant-by-align-long.py simple-haplo-reads.dn.ct simple-long.fa
#! /usr/bin/env python
import sys
import argparse
import screed
import math
import khmer
def ignore_at(lines):
    """Yield items from `lines`, skipping SAM header lines (those
    starting with '@').

    The parameter was renamed from `iter`, which shadowed the builtin.
    """
    for item in lines:
        if item.startswith('@'):
            continue
        yield item
def main():
    """Scan a SAM file against the reference genome and, for every
    aligned read, print a per-base report: reference vs read bases,
    mismatch markers, and which ref/read k-mers fall below the
    trusted-count cutoff in the khmer counting table."""
    parser = argparse.ArgumentParser()
    parser.add_argument('genome')
    parser.add_argument('samfile')
    parser.add_argument('counts_table')
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'),
                        default=sys.stdout)
    parser.add_argument('-C', '--cutoff', type=int, default=2)
    args = parser.parse_args()
    kh = khmer.load_counting_hash(args.counts_table)
    # whole reference in memory, keyed by sequence name
    genome_dict = dict([ (record.name, record.sequence) for record in \
                         screed.open(args.genome) ])
    n = 0
    n_skipped = 0
    n_rev = n_fwd = 0
    for samline in ignore_at(open(args.samfile)):
        n += 1
        if n % 100000 == 0:
            print >>sys.stderr, '...', n
        # mandatory SAM columns; only name/flags/ref/pos/seq are used
        readname, flags, refname, refpos, _, _, _, _, _, seq = \
            samline.split('\t')[:10]
        if refname == '*' or refpos == '*':
            # unmapped read
            # (don't count these as skipped)
            continue
        refpos = int(refpos)            # SAM positions are 1-based
        try:
            ref = genome_dict[refname][refpos-1:refpos+len(seq) - 1]
        except KeyError:
            print >>sys.stderr, "unknown refname: %s; ignoring (read %s)" % (refname, readname)
            n_skipped += 1
            continue
        errors = []
        out1 = []                       # reference bases
        out2 = []                       # read bases
        out3 = []                       # 'x' where ref != read
        out4 = []                       # '|' where ref k-mer is below cutoff
        out5 = []                       # '<'/'0' where read k-mer is below cutoff
        for pos, (a, b) in enumerate(zip(ref, seq)):
            if a != b:
                # see http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#sam-output - '16' is the revcomp flag.
                if int(flags) & 16:
                    # NOTE(review): this flips `pos` into original-read
                    # coordinates, and the k-mer lookups below then use
                    # the flipped position -- confirm that is intended.
                    pos = len(seq) - pos - 1
                    n_rev += 1
                else:
                    n_fwd += 1
                errors.append(pos)
            out1.append(a)
            out2.append(b)
            if a != b:
                out3.append('x')
            else:
                out3.append(' ')
            kmer = ref[pos:pos+kh.ksize()]
            if len(kmer) == kh.ksize() and kh.get(kmer) < args.cutoff:
                out4.append('|')
            else:
                out4.append(' ')
            kmer = seq[pos:pos+kh.ksize()]
            if len(kmer) == kh.ksize() and kh.get(kmer) < args.cutoff:
                # '0' = k-mer absent entirely, '<' = present but untrusted
                if kh.get(kmer) == 0:
                    out5.append('0')
                else:
                    out5.append('<')
            else:
                out5.append(' ')
        # emit the report in 60-column blocks
        for start in range(0, len(out1), 60):
            if start == 0:
                print >>args.outfile, '%s positions %s:%s (len %d)' % \
                      (readname, start, min(start+60, len(out1)), len(out1))
            else:
                print >>args.outfile, '%s:%s (len %d)' % \
                      (start, min(start+60, len(out1)), len(out1))
            print >>args.outfile, ' ref:', "".join(out1[start:start+60])
            print >>args.outfile, ' read:', "".join(out2[start:start+60])
            print >>args.outfile, ' mismatch:', "".join(out3[start:start+60])
            print >>args.outfile, ' ref kmer:', "".join(out4[start:start+60])
            print >>args.outfile, 'read kmer:', "".join(out5[start:start+60])
            print >>args.outfile, ''
        print >>args.outfile, '-------'
if __name__ == '__main__':
    main()
#! /usr/bin/env python
import sys
import argparse
import screed
import math
def ignore_at(iter):
    """Generator: pass through only lines that are not SAM '@' header
    lines."""
    for entry in iter:
        if not entry.startswith('@'):
            yield entry
def main():
    """Scan a SAM file against a reference genome and write one .pos
    line per aligned read: "readname p1,p2,..." listing mismatch
    positions (flipped into original-read coordinates for
    reverse-complement alignments)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('genome')
    parser.add_argument('samfile')
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'),
                        default=sys.stdout)
    args = parser.parse_args()
    # whole reference in memory, keyed by sequence name
    genome_dict = dict([ (record.name, record.sequence) for record in \
                         screed.open(args.genome) ])
    n = 0
    n_skipped = 0
    n_rev = n_fwd = 0
    for samline in ignore_at(open(args.samfile)):
        n += 1
        if n % 100000 == 0:
            print >>sys.stderr, '...', n
        # mandatory SAM columns; only name/flags/ref/pos/seq are used
        readname, flags, refname, refpos, _, _, _, _, _, seq = \
            samline.split('\t')[:10]
        if refname == '*' or refpos == '*':
            # unmapped read
            # (don't count these as skipped)
            continue
        refpos = int(refpos)            # SAM positions are 1-based
        try:
            ref = genome_dict[refname][refpos-1:refpos+len(seq) - 1]
        except KeyError:
            print >>sys.stderr, "unknown refname: %s; ignoring (read %s)" % (refname, readname)
            n_skipped += 1
            continue
        errors = []
        for pos, (a, b) in enumerate(zip(ref, seq)):
            if a != b:
                # see http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#sam-output - '16' is the revcomp flag.
                if int(flags) & 16:
                    # flip into original-read coordinates
                    pos = len(seq) - pos - 1
                    n_rev += 1
                else:
                    n_fwd += 1
                errors.append(pos)
        print >>args.outfile, readname, ",".join(map(str, errors))
    # avoid log errors via pseudocount
    n_fwd += 1
    n_rev += 1
    print >>sys.stderr, 'logratio of fwd to rev: %.2f' % (math.log(n_fwd / float(n_rev), 2))
    # fail loudly if more than 1% of reads hit unknown reference names
    # (NOTE: Python-2-only raise syntax)
    if n_skipped / float(n) > .01:
        raise Exception, "Error: too many reads ignored! %d of %d" % \
              (n_skipped, n)
if __name__ == '__main__':
    main()
>genome2
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCTTGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
>genome1
CTGCCATTGCCCCAACATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCTTCATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
>genome3
GTCCTGGCGGTCCCCATTCA
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCT
GTAGGAAGTCACATGTGCTAAATATCAG
TGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
GCACAGTTTTTTATACGAAACCGTCTGCGTCAAGACGGGCCACATGGT
>genome 2
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCTTGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
>genome2 1.5
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCTTCATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTATATTGATCTAACTTAATGTCGTAGTGTGTTGCAAATGCTCGAGAGCGTGATGGCGGTTCGGACATTGGAAATACCCACGCACTCAAGTACGTAGAAGCA
#! /usr/bin/env python
import argparse
import screed
def output_single(read):
    """Render `read` as FASTQ when a quality string ('accuracy') is
    present, FASTA otherwise; the quality must match the sequence
    length."""
    name, sequence = read.name, read.sequence
    if hasattr(read, 'accuracy'):
        accuracy = read.accuracy
        assert len(sequence) == len(accuracy), (sequence, accuracy)
        return "@%s\n%s\n+\n%s\n" % (name, sequence, accuracy)
    return ">%s\n%s\n" % (name, sequence)
# read in list of error positions per read
def read_pos_file(filename, ignore_set=None):
    """Parse a .pos file, yielding (read_name, [sorted error positions]).

    `ignore_set` (optional) collects reads flagged 'V' (variable
    coverage); those reads, and reads already in the set, yield [].

    Bug fix: the previous default of a shared, mutated `set()` leaked
    ignore state between calls; a None sentinel is used instead.
    """
    if ignore_set is None:
        ignore_set = set()
    for line in open(filename):
        line = line.strip()
        try:
            # maxsplit=1: the read name is everything before the first space
            read, posns = line.split(' ', 1)
        except ValueError:
            read = line
            posns = []

        if read in ignore_set:
            posns = []
        elif posns:
            if posns == 'V':            # '==', not 'is': value comparison
                ignore_set.add(read)    # ignore variable coverage foo
                posns = []
            else:
                posns = sorted(int(x) for x in posns.split(','))
        yield read, posns
def main():
    """Split reads from `readsfile` into .err1/.err2/.err3/.err4 output
    files according to how many error positions `posfile` lists for
    them (.err4 = four or more); error-free reads are not written."""
    parser = argparse.ArgumentParser()
    parser.add_argument('posfile')
    parser.add_argument('readsfile')
    args = parser.parse_args()

    posdict = dict(read_pos_file(args.posfile))

    # context managers so all four outputs are flushed/closed on exit
    with open(args.readsfile + '.err1', 'w') as out_1err, \
         open(args.readsfile + '.err2', 'w') as out_2err, \
         open(args.readsfile + '.err3', 'w') as out_3err, \
         open(args.readsfile + '.err4', 'w') as out_4err:
        # (unused enumerate counter from the original removed)
        for read in screed.open(args.readsfile):
            errors = posdict.get(read.name, [])
            if not errors:
                continue
            if len(errors) == 1:
                out_1err.write(output_single(read))
            elif len(errors) == 2:
                out_2err.write(output_single(read))
            elif len(errors) == 3:
                out_3err.write(output_single(read))
            elif len(errors) >= 4:
                out_4err.write(output_single(read))


if __name__ == '__main__':
    main()
#! /usr/bin/env python
import sys
import argparse
import screed
def output_single(read):
    """Serialize one read: FASTQ if it carries a quality string, else
    FASTA.  Quality length must equal sequence length."""
    has_qual = hasattr(read, 'accuracy')
    if not has_qual:
        return ">%s\n%s\n" % (read.name, read.sequence)
    qual = read.accuracy
    assert len(read.sequence) == len(qual), (read.sequence, qual)
    return "@%s\n%s\n+\n%s\n" % (read.name, read.sequence, qual)
def read_pos_file(filename):
    """Parse a .pos file, yielding (read_name, [error positions]).

    A line is "readname p1,p2,..."; a bare read name yields [].

    Fixes: maxsplit=1 (the original's maxsplit=2 would break unpacking
    on a line with two spaces), and the positions are materialized as a
    list — callers take len() of them, which fails on a Python 3 map
    iterator.
    """
    for line in open(filename):
        line = line.strip()
        try:
            read, posns = line.split(' ', 1)
            posns = [int(x) for x in posns.split(',')]
        except ValueError:
            read = line
            posns = []
        yield read, posns
def main():
    """Summarize a .pos file over a read set: count mutated reads and
    total mutations, report the overall per-bp error rate, and
    optionally save the erroneous reads to a separate file."""
    parser = argparse.ArgumentParser()
    parser.add_argument('posfile')
    parser.add_argument('reads')
    parser.add_argument('--limitreads', default=None)
    parser.add_argument('--save-erroneous-to', dest='save_err_to',
                        help="output erroneous reads to this file",
                        type=argparse.FileType('w'), default=None)
    args = parser.parse_args()
    print 'reading files...', args.posfile, args.reads
    posdict = dict(read_pos_file(args.posfile))
    # optionally restrict the summary to reads named in another pos file
    limitnames = None
    if args.limitreads:
        limitnames = set([ readname for readname, _ in \
                           read_pos_file(args.limitreads) ])
    # first pass over the reads: count reads and total bp
    all_reads = 0
    sum_bp = 0
    print 'reading sequences...'
    for n, record in enumerate(screed.open(args.reads)):
        if n % 100000 == 0:
            print >>sys.stderr, '...', n
        if args.limitreads and record.name not in limitnames:
            continue
        all_reads += 1
        sum_bp += len(record.sequence)
    print 'done!'
    n_reads = 0
    n = 0                               # reads with at least one error
    m = 0                               # total error positions
    skipped = 0
    for k, v in posdict.iteritems():
        if args.limitreads and k not in limitnames:
            skipped += 1
            continue
        n_reads += 1
        if not v:
            continue
        n += 1
        m += len(v)
    # second pass over the reads to save erroneous ones, if requested
    if args.save_err_to:
        for nn, record in enumerate(screed.open(args.reads)):
            if nn % 100000 == 0:
                print >>sys.stderr, '... x save', nn
            if posdict.get(record.name):
                args.save_err_to.write(output_single(record))
    print 'XXX', all_reads, n_reads
    print 'posfile %s: %d mutated reads of %d; %d mutations total' % \
          (args.posfile, n, n_reads, m)
    print 'skipped:', skipped
    print '%d bp total' % (sum_bp,)
    # NOTE(review): divides by sum_bp -- raises ZeroDivisionError on an
    # empty read set
    print 'overall error rate: %f%%' % (100. * m / float(sum_bp))
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment