Created
November 6, 2013 16:33
-
-
Save BenLangmead/7339417 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def neighbors1mm(kmer, alpha):\n", | |
" ''' Generate all neighbors at Hamming distance 1 from kmer '''\n", | |
" neighbors = []\n", | |
" for j in xrange(len(kmer)-1, -1, -1):\n", | |
" oldc = kmer[j]\n", | |
" for c in alpha:\n", | |
" if c == oldc: continue\n", | |
" neighbors.append(kmer[:j] + c + kmer[j+1:])\n", | |
" return neighbors" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"neighbors1mm('CAT', 'ACGT')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 2, | |
"text": [ | |
"['CAA', 'CAC', 'CAG', 'CCT', 'CGT', 'CTT', 'AAT', 'GAT', 'TAT']" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def kmerHist(reads, k):\n", | |
" ''' Return k-mer histogram and average # k-mer occurrences '''\n", | |
" kmerhist = {}\n", | |
" for read in reads:\n", | |
" for kmer in [ read[i:i+k] for i in xrange(0, len(read)-(k-1)) ]:\n", | |
" kmerhist[kmer] = kmerhist.get(kmer, 0) + 1\n", | |
" return kmerhist" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"khist = kmerHist(['CAT' * 10], 3)\n", | |
"khist" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"{'ATC': 9, 'CAT': 10, 'TCA': 9}" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def correct1mm(read, k, kmerhist, alpha, thresh):\n", | |
" ''' Return an error-corrected version of read. k = k-mer length.\n", | |
" kmerhist is kmer count map. alpha is alphabet. thresh is\n", | |
" count threshold above which k-mer is considered correct. '''\n", | |
" # Iterate over k-mers in read\n", | |
" for i in xrange(0, len(read)-(k-1)):\n", | |
" kmer = read[i:i+k]\n", | |
" # If k-mer is infrequent...\n", | |
" if kmerhist.get(kmer, 0) <= thresh:\n", | |
" # Look for a frequent neighbor\n", | |
" for newkmer in neighbors1mm(kmer, alpha):\n", | |
" if kmerhist.get(newkmer, 0) > thresh:\n", | |
" # Found a frequent neighbor; replace old kmer\n", | |
" # with neighbor\n", | |
" read = read[:i] + newkmer + read[i+k:]\n", | |
" break\n", | |
" # Return possibly-corrected read\n", | |
" return read" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"correct1mm('CAT', 3, khist, 'ACGT', 2)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 6, | |
"text": [ | |
"'CAT'" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"correct1mm('CTT', 3, khist, 'ACGT', 2)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
"'CAT'" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment