Skip to content

Instantly share code, notes, and snippets.

@ctb
Last active August 29, 2015 14:06
Show Gist options
  • Save ctb/a5ab6664e789c6ec1897 to your computer and use it in GitHub Desktop.
Save ctb/a5ab6664e789c6ec1897 to your computer and use it in GitHub Desktop.
import khmer
import screed
import random
K = 20
# read in genome reference
genome = list(screed.open('genome.fa'))[0].sequence
# build a counting hash and a read aligner on top of that
ht = khmer.new_counting_hash(K, 1e6, 4)
aligner = khmer.new_readaligner(ht, 1, 1)
# consume the reference sequence, add tags, & return positions of those
# tags
tags = ht.consume_and_tag_with_positions(genome)
# build a tag->list of positions dictionary. (with repetitive sequence,
# tags may have multiple locations in the reference sequence)
tags_to_pos = {}
for p, t in tags:
x = tags_to_pos.get(t, [])
x.append(p)
tags_to_pos[t] = x
def min_distance_to_tag(pos, x):
y = [ abs(pos - ipos) for ipos in x ]
return min(y)
# do a little "simulation" - get exact subsequences, make mutations,
# align, retrieve positions by tag.
for i in range(10):
pos = random.randint(0, len(genome) - 100)
seq = genome[pos:pos+100]
do_rc = random.choice([0, 1])
if do_rc:
seq = screed.rc(seq)
print '---'
for p, t in ht.retrieve_tags(seq):
x = tags_to_pos[t]
print ' '*5, tags_to_pos.get(t), p, min_distance_to_tag(pos, x)
#assert min_distance_to_tag(pos, x) < 100, min_distance_to_tag(pos, x)
# make a mutated sequence
mpos = random.randint(0, 99)
mseq = seq[:mpos] + random.choice(['a', 'c', 'g', 't']) + seq[mpos + 1:]
# align that sucker
score, graph_a, read_a, trunc = aligner.align(mseq)
graph_a = graph_a.replace('-', '')
print ''
print graph_a
print read_a
mut_loc = ''
for a, b in zip(graph_a, read_a):
if a != b:
mut_loc += '*'
else:
mut_loc += '.'
print mut_loc
# compare.
for p, t in ht.retrieve_tags(graph_a):
x = tags_to_pos[t]
print ' '*5, tags_to_pos.get(t), p, min_distance_to_tag(pos, x)
#assert min_distance_to_tag(pos, x) < 100, min_distance_to_tag(pos, x)
>genome
TCCGTGCGCGGGTGGCTCAGCCCCAGTTGGTAAGCGGTACAGCCGAGACGTCCTCACAACATATCCACGCTTAGCATAAGCAGCCGGTTCCAACATATTTTACAGTAGGAAAGTGCCGTGCCATGCATCCGACATACTATTGGTCCGAGTACAGGGTGTTACACAATGTACTCGATCAGGAACGGACTGGACTCTTGGTTTGGTTGAGTGGATTATGTGTGTTATTGAAATTTCGCAACGATTCTTTTGAGCACATCAATAGCTGGCCTTGAGCCCGTCGATCAAGTGTCAATGCTGCACAATTATACGGTTCACACACACCATTTGTCGTTGTGACTATCCTGAACGGCGTGAAAAGCTTTGTGACAAGGATCAATCGCCTCTGAGCGAGCCAAAATCCAGCGCGTGAAGACACATTTAAGTACCGAGCGATGTGTATCTGGGCGCGAATCGCACAGATGCCCTCTTCCGGACATCTGGTGAAACGCGCAACCTTCTCG
TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGAAGGTCATTCACACGCAGCGTCATTTAATGGATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTTCATCTGGACTTGTACGTGCGACAGCTCCTTCCATTTCCGCCTTGCCATACAGACCACCTAAGACCGCAGACCCTCCTCCTTACCACATGCGATGCGTGGGAACCGGTGTCAAAGACGGGTGCCGCTACACAGGAAGGCACCCAGGGAAAGTCGTTTGCCGGAAGAGAGTGGAGCTCCTACGTAAACGGGGAAACCACTTGTTTGGATTCCCCCTTGCCGATTCGGCCCTATCAGGATGTATTTAACTTAGGAGAAACCGAACAACTGCCACCGCTTATTGCCCCGGCAGGCGGTAGTTTCCACGATCTAACAATCGAAGCAATTCGGACAGGCTTAAGCTACAAAGCTCGGATTTTGTAAGTGCTCTATCCTTTGTAGGAAGTGAAAGATGACGTTGCGGCCGTCGCTGTTGGAGGAACCGCAGCACCATGGCGCCTGTGCGAGCTGGAGATCCTCTCATAGCGTCAGAGCACGGGATGCTGTATATTAAGCACACAATAGCCCGGGGACCGGCCCCAACGTGAAATGCCTGGCCTGCCGTTCTTTATAGTGCTCGTGATAGTGTTATAAAGGAACTAACATCAAGTTATGTAAGGACTTTTACAATAGCGTGGTCCGTCAAGTCGTCCACGTGTGTAAATTCATTGGTACCTTTTGCCGAAAAATTTGAAAGCTAAGCACATTCTGCTTACTCACAGGGTAAGTTCCTGAAGTATTAATGTAATGTGGAAAGACAGGCATATGAACACTATTGGGCTTTGTAGACATTCCTCATCCATGCTGTATCAGTAATGTACAATTCGCCCCTTTCGTAAAGGAGAGCCGTGCTAACGTTATATTCGGTCTTACCACGGGCTCGATAGTTTGCCCC
TCCGTGCGCGGGTGGCTCAGCCCCAGTTGGTAAGCGGTACAGCCGAGACGTCCTCACAACATATCCACGCTTAGCATAAGCAGCCGGTTCCAACATATTTTACAGTAGGAAAGTGCCGTGCCATGCATCCGACATACTATTGGTCCGAGTACAGGGTGTTACACAATGTACTCGATCAGGAACGGACTGGACTCTTGGTTTGGTTGAGTGGATTATGTGTGTTATTGAAATTTCGCAACGATTCTTTTGAGCACATCAATAGCTGGCCTTGAGCCCGTCGATCAAGTGTCAATGCTGCACAATTATACGGTTCACACACACCATTTGTCGTTGTGACTATCCTGAACGGCGTGAAAAGCTTTGTGACAAGGATCAATCGCCTCTGAGCGAGCCAAAATCCAGCGCGTGAAGACACATTTAAGTACCGAGCGATGTGTATCTGGGCGCGAATCGCACAGATGCCCTCTTCCGGACATCTGGTGAAACGCGCAACCTTCTCG
TTTATAGGAACTCCCCGACAAACACACCCTGTTTGCGCAGTGGGATTACGTAAATTGGAGACGACGGCCGCTACCATTGTCTTGTTCGTTGGAGCATAGCATTACGCCATAGCAGTGAGCTTAATTATCGGGCACTAAGGCTGTCGAAACAGAGACGGCGTACGGACGCGGTCTTACCGATGCAAGAGCGCTCCTCATCATGAGCGGTACTAACATCTAAGGTTGGGCGACCAGCTAAAATCGCCTCAATCCTTAGGAGCCAAACGATCAACCTTTAGAGGTCCGGTTAGCAATTTAGGCGGACACCGGATCGTCAACAGCTAGGAGATTTTGCAATACACACCATCCGCGAGACACGACAAACCTAGTGGTTCTGCAGCATCTCTAAGTCGCCTCCGTCGCCAGGCTAGAGTCGACGTTACGTACGTCAACTGTAGCAAAAAGTGCTTGGTTCCCAAATTCATTATCTTTGATCACGGGATACCAGAGGATACGAGGGAAAACTCAGTTCCGGTAAAAAACTTTTCGATGTTGCCCCACATCGTGTGTCTCACGCAGCCGGAGTGCCAAGGAAATCAGGGTAATATTCGGAGGACTGACAGTGCGGGGGATTATTTGGTTCCACACTCCCGGTGGGCCGATATGAAGCGTGCCGTTCCCTTGCGTCTCGTTTCGTCTCCCGGTCCCGTTCTTGTGCCTGTTCTGAAACTAATCGAAGTCCGTGTCTTAACCAGTGTAGATCTTGTGGTCTACGAGTGCTCTGTTCAAGGTGATAAACATCTCGCTCTAAAACAATAATACACATCTCGCCAATCGAGATCTGCCGTGGAGTGTAGCCGAAAGAAGATATTTCGAGTGGTGCCAGATCACCCAGACTTAGTTGTCGGTTTTCCCCCGCGAGGACGGGATCATGTATGGGGATTCTTTTACATCATCAAAACCCGCCTCGGATGTGTCCGCTCTGTTGGATTGGAGCTCCTTACCTAAGATTGGGAAAACA
TCCGTGCGCGGGTGGCTCAGCCCCAGTTGGTAAGCGGTACAGCCGAGACGTCCTCACAACATATCCACGCTTAGCATAAGCAGCCGGTTCCAACATATTTTACAGTAGGAAAGTGCCGTGCCATGCATCCGACATACTATTGGTCCGAGTACAGGGTGTTACACAATGTACTCGATCAGGAACGGACTGGACTCTTGGTTTGGTTGAGTGGATTATGTGTGTTATTGAAATTTCGCAACGATTCTTTTGAGCACATCAATAGCTGGCCTTGAGCCCGTCGATCAAGTGTCAATGCTGCACAATTATACGGTTCACACACACCATTTGTCGTTGTGACTATCCTGAACGGCGTGAAAAGCTTTGTGACAAGGATCAATCGCCTCTGAGCGAGCCAAAATCCAGCGCGTGAAGACACATTTAAGTACCGAGCGATGTGTATCTGGGCGCGAATCGCACAGATGCCCTCTTCCGGACATCTGGTGAAACGCGCAACCTTCTCG
CGGTAAGGCAATCCTGATCATGGTGGGTTAACGGAATGGTGAGAGCCTACATCGTGCACCTTGAGAATAATGGTCGGCTTCCCATTAGGTGCATGAGCCTAGGCCGCGGGATGTGGCGGTGTCGAGCACGACCATAAATTACCAGATTGATGCTTCGGCGCGCAATGGAGAAATGGCACACTCAGGAGTCCTGCTTTGACGCGTGACTGGTTATGGTCAAGCCACCATGGCTTTTATGTGGGCTTGCTTGACGGAGACTATCGCATAGAGCCGCTGAAAGTCCATCATAAGACGCATCAACACTGTGGGGCATCTAAACCCCATGCCATTTGCTTTAATTGACGAGTTAGATCTTAGAATTATCACTTTCGCGCTATCCTGTGAGGCAGCCTACACTGAGGACGCACCTAGCTAGGCACCGCTATGGGCCATGATTTTGGTGCCTGCGCTGCTTCCATGGCTGGCTCAGAAGATTCCCCTTGCATTGCCGCTAACAGACCACGGAAGGGACCGTGGGGCTTCCCTAATCTGGAGTCCTCGAGCAGAACCTACACTTAATAGGTAGGGATCAACGTTGGTCCGCACTAAACTAATGGACCGTGCCAACAGGAGTTCTTATTCGCTCATGAAGGTCTAGACTAGGCCGAGAAATGTGCTAAAACCAGGACACCCGTACTTTGAAATTCATTATGATCCATCATTGAGTCTGCCTGAAGTATGAAGAGAACCACATTTCAACTTACAGCCCAAGATCTCGACGGATTTGGATGCATGATTCTATCTAAGATACATCGTTTCGCCCTCGCTATTCACAATTGTACGTCATGTAGTATATCGACTCACCCGCGCGCTTCCGGATAACACCTGCACTGATGTGAGAAACGGTCTGCATTGGCTCATATGTTATCGTTAAACTTTGCACGTAAAAAGGCTGTCGACCTAGACTCTGTTAGCAAAACCACGTCGCCAACCTTCCGGCGTGGAACCCGGAGTGGTCGTG
TCCGTGCGCGGGTGGCTCAGCCCCAGTTGGTAAGCGGTACAGCCGAGACGTCCTCACAACATATCCACGCTTAGCATAAGCAGCCGGTTCCAACATATTTTACAGTAGGAAAGTGCCGTGCCATGCATCCGACATACTATTGGTCCGAGTACAGGGTGTTACACAATGTACTCGATCAGGAACGGACTGGACTCTTGGTTTGGTTGAGTGGATTATGTGTGTTATTGAAATTTCGCAACGATTCTTTTGAGCACATCAATAGCTGGCCTTGAGCCCGTCGATCAAGTGTCAATGCTGCACAATTATACGGTTCACACACACCATTTGTCGTTGTGACTATCCTGAACGGCGTGAAAAGCTTTGTGACAAGGATCAATCGCCTCTGAGCGAGCCAAAATCCAGCGCGTGAAGACACATTTAAGTACCGAGCGATGTGTATCTGGGCGCGAATCGCACAGATGCCCTCTTCCGGACATCTGGTGAAACGCGCAACCTTCTCG
TCAGTCAGTGTAGCCCTGGCGCGTACACGATGCCGCTGTTGAGCTGCGGGCATGAGTACTCGCGCTGAAACTCCCAATCATCCTTACGCCTACCAGCCTACAATAAGAGAGTTCTATAGGAACGGCCGCTCTGGCTAATGTCTATTTCTAAGTTACCTTATCAGGTGAGCCATGCCACCCGTAAACCCTTGTCGTGTGCGTTTTATTCTTAATAAACTCACCGTGACCGTGAAGTTCTATTCGGAGTCCTCCCGAGGGCATACGCTGCGAGCGGTGTGTCACTCCACACGGCGCACGTGAGCGCTCTATATTTCCCAGGACCGTACGCTAAGTGGACGTTCAGATAGGATGAGTGAGACGCAGGTGGGATCACTTAGAATTAGGTGCCACTGTTAAGGCCACGTGGCGTTGGTGTGAATGAGTCGCATACTATACATTCTTGTAGGATTGTAATTGATCCTGCCCCAAAATGGTTGCATGGAAGCGAATTAACGGAAAACTAGTCTAGACCTCTTGCGGGTCTCTCTTGACGTGGCAATGGAACCAGACAACGCATCAGAGAAATCTATACATCCATCCCACGTTTCTTGGTATAGTCTGATGTTATTTGCTGGACCACCTTCCCAATTAGGGAGAGTATAGCTACGTGTATAATAAAAGCTTTTCCTTGCGAACGGATTGGGACGTACACATGAGTTAAACATGGCGAAGCTCCTTCCACCCGCCCAGGGAGGGACTGCCCTTAGTCGTTCAGGCGATTAGGGTCCTAATACGTTTAACAACGCTCAGCGCTGGTAAATCGAGGGAAGCAGTGAATCCCCCCGGACGCCCCCGGCGGCTAGCATGAGGCGCGATTCCGCTGTGTCCAACTTTCCGGAAAATGAAAAGAAGTCGGCACTGCCGAAGACAACTCTTTCCGAAGGTATGATAAGGGCCTAGCACTGATACTCTAGTTAAGAACATAGGAACTATGCACCTGTGCATGTACGTTTCATCCTTA
TCCGTGCGCGGGTGGCTCAGCCCCAGTTGGTAAGCGGTACAGCCGAGACGTCCTCACAACATATCCACGCTTAGCATAAGCAGCCGGTTCCAACATATTTTACAGTAGGAAAGTGCCGTGCCATGCATCCGACATACTATTGGTCCGAGTACAGGGTGTTACACAATGTACTCGATCAGGAACGGACTGGACTCTTGGTTTGGTTGAGTGGATTATGTGTGTTATTGAAATTTCGCAACGATTCTTTTGAGCACATCAATAGCTGGCCTTGAGCCCGTCGATCAAGTGTCAATGCTGCACAATTATACGGTTCACACACACCATTTGTCGTTGTGACTATCCTGAACGGCGTGAAAAGCTTTGTGACAAGGATCAATCGCCTCTGAGCGAGCCAAAATCCAGCGCGTGAAGACACATTTAAGTACCGAGCGATGTGTATCTGGGCGCGAATCGCACAGATGCCCTCTTCCGGACATCTGGTGAAACGCGCAACCTTCTCG
GCCCCAACGAGCCTCGTGCGTTGATAATACCAATATGATCAAGGCCAGTCTAGGTTGATTACTTCGGAACACAGCGGACCCAGTGACAGAAAGACCCATGTCGAATGCGGGTATTACGATAGAAGTATGCAGTGACTGGTGCCTAATGGAGGAACCCGAAGTGTTCATGAATCCGATCAATCTGACCCGATTCTTGAAGGTACCCATTATCCTACTCGGATTGTACAAGTGGGCTTGCCCAGAATTAAATTAATGTATTTCTTGATCATTAAGAGTGGCTACGGAGAGCCCTTGACTGGAATCTTAGCCCAGCCATGCCTTTGACCCGTAGTCGATACGTAAATCCGTTTAACCTCCTCTTGGAAGTAACCTTTGCAGGCAGGCGACCGCACTTCTTAAACAGGAACCATGCCCTTGCCCGAAGGTCGCGCTAGTTCTGAATCCGTGGCCGCGTTATACGTTTCGAAGATTGTTGAACCCTTAACACGATCTCATCGGCGACTCTCCACGCAATGAGAGTAAGTAGTGGCCCTAGTGGTCTCCCCGGCCGGAGTGCCAAGTGCGTCGAAGTTTGCCTCTTGTCAACATGGGACGGGCAGACCCTAACGCCGAACTCTTCGTGACCTCGGTATTGACGGCCTAAAAATGCTCGTAGCGGGCTGGGCCAATATTGGAGCATGGATTACCTATGCCAGGTGAGGGAAAAACTTATCATCCAAACTGTTACCGAACACCGGAAAGGTTGTCGTAAACGGTGGCTGAACTTGATTGAGCTCGGTGAATCTATCGATTGCTTCTATGGACTGGTAGTGTCCAGCGACTTATCGTCATGGTGTCCCGATGATTTCCACAAGAGTCGATCGGCCTCAATGTGGTGCCTCCTGAATACGAGATTAAATAAGATCCAGAAGCCCTTTGCGGTTAACGCCCGATAACCAGAGCGGCCTTCGATCTCTCCCGAACGCAGAGAGTTGTCTTACCCGAGATTTTTTCAGATGGC
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment