Skip to content

Instantly share code, notes, and snippets.

@BenLangmead
Created November 6, 2013 16:33
Show Gist options
  • Save BenLangmead/7339417 to your computer and use it in GitHub Desktop.
Save BenLangmead/7339417 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"def neighbors1mm(kmer, alpha):\n",
" ''' Generate all neighbors at Hamming distance 1 from kmer '''\n",
" neighbors = []\n",
" for j in xrange(len(kmer)-1, -1, -1):\n",
" oldc = kmer[j]\n",
" for c in alpha:\n",
" if c == oldc: continue\n",
" neighbors.append(kmer[:j] + c + kmer[j+1:])\n",
" return neighbors"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"neighbors1mm('CAT', 'ACGT')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"['CAA', 'CAC', 'CAG', 'CCT', 'CGT', 'CTT', 'AAT', 'GAT', 'TAT']"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def kmerHist(reads, k):\n",
" ''' Return k-mer histogram and average # k-mer occurrences '''\n",
" kmerhist = {}\n",
" for read in reads:\n",
" for kmer in [ read[i:i+k] for i in xrange(0, len(read)-(k-1)) ]:\n",
" kmerhist[kmer] = kmerhist.get(kmer, 0) + 1\n",
" return kmerhist"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"khist = kmerHist(['CAT' * 10], 3)\n",
"khist"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"{'ATC': 9, 'CAT': 10, 'TCA': 9}"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def correct1mm(read, k, kmerhist, alpha, thresh):\n",
" ''' Return an error-corrected version of read. k = k-mer length.\n",
" kmerhist is kmer count map. alpha is alphabet. thresh is\n",
" count threshold above which k-mer is considered correct. '''\n",
" # Iterate over k-mers in read\n",
" for i in xrange(0, len(read)-(k-1)):\n",
" kmer = read[i:i+k]\n",
" # If k-mer is infrequent...\n",
" if kmerhist.get(kmer, 0) <= thresh:\n",
" # Look for a frequent neighbor\n",
" for newkmer in neighbors1mm(kmer, alpha):\n",
" if kmerhist.get(newkmer, 0) > thresh:\n",
" # Found a frequent neighbor; replace old kmer\n",
" # with neighbor\n",
" read = read[:i] + newkmer + read[i+k:]\n",
" break\n",
" # Return possibly-corrected read\n",
" return read"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"correct1mm('CAT', 3, khist, 'ACGT', 2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"'CAT'"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"correct1mm('CTT', 3, khist, 'ACGT', 2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"'CAT'"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment