Last active
August 29, 2015 14:06
-
-
Save ctb/a5ab6664e789c6ec1897 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import khmer | |
import screed | |
import random | |
K = 20 | |
# read in genome reference | |
genome = list(screed.open('genome.fa'))[0].sequence | |
# build a counting hash and a read aligner on top of that | |
ht = khmer.new_counting_hash(K, 1e6, 4) | |
aligner = khmer.new_readaligner(ht, 1, 1) | |
# consume the reference sequence, add tags, & return positions of those | |
# tags | |
tags = ht.consume_and_tag_with_positions(genome) | |
# build a tag->list of positions dictionary. (with repetitive sequence, | |
# tags may have multiple locations in the reference sequence) | |
tags_to_pos = {} | |
for p, t in tags: | |
x = tags_to_pos.get(t, []) | |
x.append(p) | |
tags_to_pos[t] = x | |
def min_distance_to_tag(pos, x): | |
y = [ abs(pos - ipos) for ipos in x ] | |
return min(y) | |
# do a little "simulation" - get exact subsequences, make mutations, | |
# align, retrieve positions by tag. | |
for i in range(10): | |
pos = random.randint(0, len(genome) - 100) | |
seq = genome[pos:pos+100] | |
do_rc = random.choice([0, 1]) | |
if do_rc: | |
seq = screed.rc(seq) | |
print '---' | |
for p, t in ht.retrieve_tags(seq): | |
x = tags_to_pos[t] | |
print ' '*5, tags_to_pos.get(t), p, min_distance_to_tag(pos, x) | |
#assert min_distance_to_tag(pos, x) < 100, min_distance_to_tag(pos, x) | |
# make a mutated sequence | |
mpos = random.randint(0, 99) | |
mseq = seq[:mpos] + random.choice(['a', 'c', 'g', 't']) + seq[mpos + 1:] | |
# align that sucker | |
score, graph_a, read_a, trunc = aligner.align(mseq) | |
graph_a = graph_a.replace('-', '') | |
print '' | |
print graph_a | |
print read_a | |
mut_loc = '' | |
for a, b in zip(graph_a, read_a): | |
if a != b: | |
mut_loc += '*' | |
else: | |
mut_loc += '.' | |
print mut_loc | |
# compare. | |
for p, t in ht.retrieve_tags(graph_a): | |
x = tags_to_pos[t] | |
print ' '*5, tags_to_pos.get(t), p, min_distance_to_tag(pos, x) | |
#assert min_distance_to_tag(pos, x) < 100, min_distance_to_tag(pos, x) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>genome | |
TCCGTGCGCGGGTGGCTCAGCCCCAGTTGGTAAGCGGTACAGCCGAGACGTCCTCACAACATATCCACGCTTAGCATAAGCAGCCGGTTCCAACATATTTTACAGTAGGAAAGTGCCGTGCCATGCATCCGACATACTATTGGTCCGAGTACAGGGTGTTACACAATGTACTCGATCAGGAACGGACTGGACTCTTGGTTTGGTTGAGTGGATTATGTGTGTTATTGAAATTTCGCAACGATTCTTTTGAGCACATCAATAGCTGGCCTTGAGCCCGTCGATCAAGTGTCAATGCTGCACAATTATACGGTTCACACACACCATTTGTCGTTGTGACTATCCTGAACGGCGTGAAAAGCTTTGTGACAAGGATCAATCGCCTCTGAGCGAGCCAAAATCCAGCGCGTGAAGACACATTTAAGTACCGAGCGATGTGTATCTGGGCGCGAATCGCACAGATGCCCTCTTCCGGACATCTGGTGAAACGCGCAACCTTCTCG | |
TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGAAGGTCATTCACACGCAGCGTCATTTAATGGATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTTCATCTGGACTTGTACGTGCGACAGCTCCTTCCATTTCCGCCTTGCCATACAGACCACCTAAGACCGCAGACCCTCCTCCTTACCACATGCGATGCGTGGGAACCGGTGTCAAAGACGGGTGCCGCTACACAGGAAGGCACCCAGGGAAAGTCGTTTGCCGGAAGAGAGTGGAGCTCCTACGTAAACGGGGAAACCACTTGTTTGGATTCCCCCTTGCCGATTCGGCCCTATCAGGATGTATTTAACTTAGGAGAAACCGAACAACTGCCACCGCTTATTGCCCCGGCAGGCGGTAGTTTCCACGATCTAACAATCGAAGCAATTCGGACAGGCTTAAGCTACAAAGCTCGGATTTTGTAAGTGCTCTATCCTTTGTAGGAAGTGAAAGATGACGTTGCGGCCGTCGCTGTTGGAGGAACCGCAGCACCATGGCGCCTGTGCGAGCTGGAGATCCTCTCATAGCGTCAGAGCACGGGATGCTGTATATTAAGCACACAATAGCCCGGGGACCGGCCCCAACGTGAAATGCCTGGCCTGCCGTTCTTTATAGTGCTCGTGATAGTGTTATAAAGGAACTAACATCAAGTTATGTAAGGACTTTTACAATAGCGTGGTCCGTCAAGTCGTCCACGTGTGTAAATTCATTGGTACCTTTTGCCGAAAAATTTGAAAGCTAAGCACATTCTGCTTACTCACAGGGTAAGTTCCTGAAGTATTAATGTAATGTGGAAAGACAGGCATATGAACACTATTGGGCTTTGTAGACATTCCTCATCCATGCTGTATCAGTAATGTACAATTCGCCCCTTTCGTAAAGGAGAGCCGTGCTAACGTTATATTCGGTCTTACCACGGGCTCGATAGTTTGCCCC | |
TCCGTGCGCGGGTGGCTCAGCCCCAGTTGGTAAGCGGTACAGCCGAGACGTCCTCACAACATATCCACGCTTAGCATAAGCAGCCGGTTCCAACATATTTTACAGTAGGAAAGTGCCGTGCCATGCATCCGACATACTATTGGTCCGAGTACAGGGTGTTACACAATGTACTCGATCAGGAACGGACTGGACTCTTGGTTTGGTTGAGTGGATTATGTGTGTTATTGAAATTTCGCAACGATTCTTTTGAGCACATCAATAGCTGGCCTTGAGCCCGTCGATCAAGTGTCAATGCTGCACAATTATACGGTTCACACACACCATTTGTCGTTGTGACTATCCTGAACGGCGTGAAAAGCTTTGTGACAAGGATCAATCGCCTCTGAGCGAGCCAAAATCCAGCGCGTGAAGACACATTTAAGTACCGAGCGATGTGTATCTGGGCGCGAATCGCACAGATGCCCTCTTCCGGACATCTGGTGAAACGCGCAACCTTCTCG | |
TTTATAGGAACTCCCCGACAAACACACCCTGTTTGCGCAGTGGGATTACGTAAATTGGAGACGACGGCCGCTACCATTGTCTTGTTCGTTGGAGCATAGCATTACGCCATAGCAGTGAGCTTAATTATCGGGCACTAAGGCTGTCGAAACAGAGACGGCGTACGGACGCGGTCTTACCGATGCAAGAGCGCTCCTCATCATGAGCGGTACTAACATCTAAGGTTGGGCGACCAGCTAAAATCGCCTCAATCCTTAGGAGCCAAACGATCAACCTTTAGAGGTCCGGTTAGCAATTTAGGCGGACACCGGATCGTCAACAGCTAGGAGATTTTGCAATACACACCATCCGCGAGACACGACAAACCTAGTGGTTCTGCAGCATCTCTAAGTCGCCTCCGTCGCCAGGCTAGAGTCGACGTTACGTACGTCAACTGTAGCAAAAAGTGCTTGGTTCCCAAATTCATTATCTTTGATCACGGGATACCAGAGGATACGAGGGAAAACTCAGTTCCGGTAAAAAACTTTTCGATGTTGCCCCACATCGTGTGTCTCACGCAGCCGGAGTGCCAAGGAAATCAGGGTAATATTCGGAGGACTGACAGTGCGGGGGATTATTTGGTTCCACACTCCCGGTGGGCCGATATGAAGCGTGCCGTTCCCTTGCGTCTCGTTTCGTCTCCCGGTCCCGTTCTTGTGCCTGTTCTGAAACTAATCGAAGTCCGTGTCTTAACCAGTGTAGATCTTGTGGTCTACGAGTGCTCTGTTCAAGGTGATAAACATCTCGCTCTAAAACAATAATACACATCTCGCCAATCGAGATCTGCCGTGGAGTGTAGCCGAAAGAAGATATTTCGAGTGGTGCCAGATCACCCAGACTTAGTTGTCGGTTTTCCCCCGCGAGGACGGGATCATGTATGGGGATTCTTTTACATCATCAAAACCCGCCTCGGATGTGTCCGCTCTGTTGGATTGGAGCTCCTTACCTAAGATTGGGAAAACA | |
TCCGTGCGCGGGTGGCTCAGCCCCAGTTGGTAAGCGGTACAGCCGAGACGTCCTCACAACATATCCACGCTTAGCATAAGCAGCCGGTTCCAACATATTTTACAGTAGGAAAGTGCCGTGCCATGCATCCGACATACTATTGGTCCGAGTACAGGGTGTTACACAATGTACTCGATCAGGAACGGACTGGACTCTTGGTTTGGTTGAGTGGATTATGTGTGTTATTGAAATTTCGCAACGATTCTTTTGAGCACATCAATAGCTGGCCTTGAGCCCGTCGATCAAGTGTCAATGCTGCACAATTATACGGTTCACACACACCATTTGTCGTTGTGACTATCCTGAACGGCGTGAAAAGCTTTGTGACAAGGATCAATCGCCTCTGAGCGAGCCAAAATCCAGCGCGTGAAGACACATTTAAGTACCGAGCGATGTGTATCTGGGCGCGAATCGCACAGATGCCCTCTTCCGGACATCTGGTGAAACGCGCAACCTTCTCG | |
CGGTAAGGCAATCCTGATCATGGTGGGTTAACGGAATGGTGAGAGCCTACATCGTGCACCTTGAGAATAATGGTCGGCTTCCCATTAGGTGCATGAGCCTAGGCCGCGGGATGTGGCGGTGTCGAGCACGACCATAAATTACCAGATTGATGCTTCGGCGCGCAATGGAGAAATGGCACACTCAGGAGTCCTGCTTTGACGCGTGACTGGTTATGGTCAAGCCACCATGGCTTTTATGTGGGCTTGCTTGACGGAGACTATCGCATAGAGCCGCTGAAAGTCCATCATAAGACGCATCAACACTGTGGGGCATCTAAACCCCATGCCATTTGCTTTAATTGACGAGTTAGATCTTAGAATTATCACTTTCGCGCTATCCTGTGAGGCAGCCTACACTGAGGACGCACCTAGCTAGGCACCGCTATGGGCCATGATTTTGGTGCCTGCGCTGCTTCCATGGCTGGCTCAGAAGATTCCCCTTGCATTGCCGCTAACAGACCACGGAAGGGACCGTGGGGCTTCCCTAATCTGGAGTCCTCGAGCAGAACCTACACTTAATAGGTAGGGATCAACGTTGGTCCGCACTAAACTAATGGACCGTGCCAACAGGAGTTCTTATTCGCTCATGAAGGTCTAGACTAGGCCGAGAAATGTGCTAAAACCAGGACACCCGTACTTTGAAATTCATTATGATCCATCATTGAGTCTGCCTGAAGTATGAAGAGAACCACATTTCAACTTACAGCCCAAGATCTCGACGGATTTGGATGCATGATTCTATCTAAGATACATCGTTTCGCCCTCGCTATTCACAATTGTACGTCATGTAGTATATCGACTCACCCGCGCGCTTCCGGATAACACCTGCACTGATGTGAGAAACGGTCTGCATTGGCTCATATGTTATCGTTAAACTTTGCACGTAAAAAGGCTGTCGACCTAGACTCTGTTAGCAAAACCACGTCGCCAACCTTCCGGCGTGGAACCCGGAGTGGTCGTG | |
TCCGTGCGCGGGTGGCTCAGCCCCAGTTGGTAAGCGGTACAGCCGAGACGTCCTCACAACATATCCACGCTTAGCATAAGCAGCCGGTTCCAACATATTTTACAGTAGGAAAGTGCCGTGCCATGCATCCGACATACTATTGGTCCGAGTACAGGGTGTTACACAATGTACTCGATCAGGAACGGACTGGACTCTTGGTTTGGTTGAGTGGATTATGTGTGTTATTGAAATTTCGCAACGATTCTTTTGAGCACATCAATAGCTGGCCTTGAGCCCGTCGATCAAGTGTCAATGCTGCACAATTATACGGTTCACACACACCATTTGTCGTTGTGACTATCCTGAACGGCGTGAAAAGCTTTGTGACAAGGATCAATCGCCTCTGAGCGAGCCAAAATCCAGCGCGTGAAGACACATTTAAGTACCGAGCGATGTGTATCTGGGCGCGAATCGCACAGATGCCCTCTTCCGGACATCTGGTGAAACGCGCAACCTTCTCG | |
TCAGTCAGTGTAGCCCTGGCGCGTACACGATGCCGCTGTTGAGCTGCGGGCATGAGTACTCGCGCTGAAACTCCCAATCATCCTTACGCCTACCAGCCTACAATAAGAGAGTTCTATAGGAACGGCCGCTCTGGCTAATGTCTATTTCTAAGTTACCTTATCAGGTGAGCCATGCCACCCGTAAACCCTTGTCGTGTGCGTTTTATTCTTAATAAACTCACCGTGACCGTGAAGTTCTATTCGGAGTCCTCCCGAGGGCATACGCTGCGAGCGGTGTGTCACTCCACACGGCGCACGTGAGCGCTCTATATTTCCCAGGACCGTACGCTAAGTGGACGTTCAGATAGGATGAGTGAGACGCAGGTGGGATCACTTAGAATTAGGTGCCACTGTTAAGGCCACGTGGCGTTGGTGTGAATGAGTCGCATACTATACATTCTTGTAGGATTGTAATTGATCCTGCCCCAAAATGGTTGCATGGAAGCGAATTAACGGAAAACTAGTCTAGACCTCTTGCGGGTCTCTCTTGACGTGGCAATGGAACCAGACAACGCATCAGAGAAATCTATACATCCATCCCACGTTTCTTGGTATAGTCTGATGTTATTTGCTGGACCACCTTCCCAATTAGGGAGAGTATAGCTACGTGTATAATAAAAGCTTTTCCTTGCGAACGGATTGGGACGTACACATGAGTTAAACATGGCGAAGCTCCTTCCACCCGCCCAGGGAGGGACTGCCCTTAGTCGTTCAGGCGATTAGGGTCCTAATACGTTTAACAACGCTCAGCGCTGGTAAATCGAGGGAAGCAGTGAATCCCCCCGGACGCCCCCGGCGGCTAGCATGAGGCGCGATTCCGCTGTGTCCAACTTTCCGGAAAATGAAAAGAAGTCGGCACTGCCGAAGACAACTCTTTCCGAAGGTATGATAAGGGCCTAGCACTGATACTCTAGTTAAGAACATAGGAACTATGCACCTGTGCATGTACGTTTCATCCTTA | |
TCCGTGCGCGGGTGGCTCAGCCCCAGTTGGTAAGCGGTACAGCCGAGACGTCCTCACAACATATCCACGCTTAGCATAAGCAGCCGGTTCCAACATATTTTACAGTAGGAAAGTGCCGTGCCATGCATCCGACATACTATTGGTCCGAGTACAGGGTGTTACACAATGTACTCGATCAGGAACGGACTGGACTCTTGGTTTGGTTGAGTGGATTATGTGTGTTATTGAAATTTCGCAACGATTCTTTTGAGCACATCAATAGCTGGCCTTGAGCCCGTCGATCAAGTGTCAATGCTGCACAATTATACGGTTCACACACACCATTTGTCGTTGTGACTATCCTGAACGGCGTGAAAAGCTTTGTGACAAGGATCAATCGCCTCTGAGCGAGCCAAAATCCAGCGCGTGAAGACACATTTAAGTACCGAGCGATGTGTATCTGGGCGCGAATCGCACAGATGCCCTCTTCCGGACATCTGGTGAAACGCGCAACCTTCTCG | |
GCCCCAACGAGCCTCGTGCGTTGATAATACCAATATGATCAAGGCCAGTCTAGGTTGATTACTTCGGAACACAGCGGACCCAGTGACAGAAAGACCCATGTCGAATGCGGGTATTACGATAGAAGTATGCAGTGACTGGTGCCTAATGGAGGAACCCGAAGTGTTCATGAATCCGATCAATCTGACCCGATTCTTGAAGGTACCCATTATCCTACTCGGATTGTACAAGTGGGCTTGCCCAGAATTAAATTAATGTATTTCTTGATCATTAAGAGTGGCTACGGAGAGCCCTTGACTGGAATCTTAGCCCAGCCATGCCTTTGACCCGTAGTCGATACGTAAATCCGTTTAACCTCCTCTTGGAAGTAACCTTTGCAGGCAGGCGACCGCACTTCTTAAACAGGAACCATGCCCTTGCCCGAAGGTCGCGCTAGTTCTGAATCCGTGGCCGCGTTATACGTTTCGAAGATTGTTGAACCCTTAACACGATCTCATCGGCGACTCTCCACGCAATGAGAGTAAGTAGTGGCCCTAGTGGTCTCCCCGGCCGGAGTGCCAAGTGCGTCGAAGTTTGCCTCTTGTCAACATGGGACGGGCAGACCCTAACGCCGAACTCTTCGTGACCTCGGTATTGACGGCCTAAAAATGCTCGTAGCGGGCTGGGCCAATATTGGAGCATGGATTACCTATGCCAGGTGAGGGAAAAACTTATCATCCAAACTGTTACCGAACACCGGAAAGGTTGTCGTAAACGGTGGCTGAACTTGATTGAGCTCGGTGAATCTATCGATTGCTTCTATGGACTGGTAGTGTCCAGCGACTTATCGTCATGGTGTCCCGATGATTTCCACAAGAGTCGATCGGCCTCAATGTGGTGCCTCCTGAATACGAGATTAAATAAGATCCAGAAGCCCTTTGCGGTTAACGCCCGATAACCAGAGCGGCCTTCGATCTCTCCCGAACGCAGAGAGTTGTCTTACCCGAGATTTTTTCAGATGGC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment