Created
January 27, 2017 08:05
-
-
Save betatim/9730aa06cd2ec0fb7d0cec7cfec8969b to your computer and use it in GitHub Desktop.
BloomFilter talk at PyDataZRH January 2017
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<div style=\"position: relative\">\n", | |
" <img style=\"position: absolute; top: 0; right: 0; max-height: 130px\" src=\"logo.jpg\" />\n", | |
"</div>\n", | |
"# Bloom Filters\n", | |
"\n", | |
"\n", | |
"**Tim Head, [Wild Tree Tech](https://www.wildtreetech.com)**\n", | |
"\n", | |
"\n", | |
"**26 January 2017**" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"For more have a look at: http://betatim.github.io/posts/genome-hackers/ and http://betatim.github.io/posts/improbable-afternoon/." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.utils import murmurhash3_32" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def hash_(value, seed=1):\n", | |
" return murmurhash3_32(value, seed=seed, positive=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"3142237357" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"hash_(\"hello\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"bits = np.zeros(100, dtype=bool)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False, False], dtype=bool)" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"bits" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def insert(bf, value):\n", | |
" idx = hash_(value) % bf.size\n", | |
" bf[idx] += 1\n", | |
" return bf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, True, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False, False], dtype=bool)" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"insert(bits, \"hello\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, True, False, True, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False, False], dtype=bool)" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"insert(bits, \"world\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def contains(bits, value):\n", | |
" idx = hash_(value) % bits.size\n", | |
" return bits[idx]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"contains(bits, \"world\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"class BloomFilter:\n", | |
" def __init__(self, m, k):\n", | |
" self.bits = np.zeros(m, dtype=bool)\n", | |
" self.k = k\n", | |
" \n", | |
" def insert(self, value):\n", | |
" for i in range(self.k):\n", | |
" idx = hash_(value, seed=1+i) % self.bits.size\n", | |
" self.bits[idx] += 1\n", | |
" \n", | |
" def contains(self, value):\n", | |
" for i in range(self.k):\n", | |
" idx = hash_(value, seed=1+i) % self.bits.size\n", | |
" if not self.bits[idx]:\n", | |
" return False\n", | |
" \n", | |
" return True" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"bf = BloomFilter(100, 3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"bf.insert(\"world\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, True, False, False, False, True, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, True, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False,\n", | |
" False, False, False, False, False, False, False, False, False, False], dtype=bool)" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"bf.bits" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"bf.contains(\"world\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"False" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"bf.contains(\"hello\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"graph = BloomFilter(1000, 3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"graph.insert(\"ACGTT\")\n", | |
"graph.insert(\"CGTTA\")\n", | |
"graph.insert(\"GTTAT\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"9" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"graph.bits.sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def next_nodes(node):\n", | |
" for c in \"ACGT\":\n", | |
" yield node[1:] + c" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def walk(graph, start):\n", | |
" now = start\n", | |
" while True:\n", | |
" print(now, '->', end=\" \")\n", | |
" for nxt in next_nodes(now):\n", | |
" if graph.contains(nxt):\n", | |
" now = nxt\n", | |
" break\n", | |
" else:\n", | |
" break\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ACGTT -> CGTTA -> GTTAT -> " | |
] | |
} | |
], | |
"source": [ | |
"walk(graph, \"ACGTT\")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment