Skip to content

Instantly share code, notes, and snippets.

@BenLangmead
Created September 16, 2013 15:46
Show Gist options
  • Save BenLangmead/6582444 to your computer and use it in GitHub Desktop.
Save BenLangmead/6582444 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "003_CG_InvertedIndex1"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# A simple example of a substring index; mirrors example from lecture notes\n",
"\n",
"# we're going to extract 4 substrings like this:\n",
"# t: CGTGCCTACTTACTTACAT\n",
"# substring 1: CGTGC\n",
"# substring 2: CCTAC\n",
"# substring 3: CTTAC\n",
"# substring 4: CTTAC\n",
"t = 'CGTGCCTACTTACTTACAT'"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# From t, make list of pairs, where first pair item is substring, second is its offset\n",
"def substringize(t, ln, iv):\n",
" # ln = length of substrings to extract\n",
" # iv = distance between substings to extract; e.g. 1 means take *every* substring\n",
" pairs = []\n",
" # Loop below is like a Java/C loop saying: for(i = 0; i < len(t) - ln + 1; i += iv)\n",
" for i in xrange(0, len(t) - ln + 1, iv):\n",
" pairs.append((t[i:i+ln], i))\n",
" return pairs\n",
" # Could also have used list comprehension:\n",
" # return [ (t[i:i+ln], i) for i in xrange(0, len(t) - ln + 1, iv) ]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"substringize('CGTGCCTACTTACTTACAT', 5, 4)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 3,
"text": [
"[('CGTGC', 0), ('CCTAC', 4), ('CTTAC', 8), ('CTTAC', 12)]"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Like substringize, but uses a map data structure\n",
"def mapize(t, ln, iv):\n",
" index = {}\n",
" for i in xrange(0, len(t) - ln + 1, iv):\n",
" sub = t[i:i+ln]\n",
" if sub in index:\n",
" index[sub].append(i) # already have an entry; append to it\n",
" else:\n",
" index[sub] = [i] # don't yet have entry, make new one\n",
" return index"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"index = mapize('CGTGCCTACTTACTTACAT', 5, 4)\n",
"index"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 5,
"text": [
"{'CCTAC': [4], 'CGTGC': [0], 'CTTAC': [8, 12]}"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p = 'CTTACTTA'"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# index: give me a hint where I should look for occurrences of p in t\n",
"index[p[:5]]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 7,
"text": [
"[8, 12]"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment