Skip to content

Instantly share code, notes, and snippets.

@betatim
Created January 27, 2017 08:05
Show Gist options
  • Save betatim/9730aa06cd2ec0fb7d0cec7cfec8969b to your computer and use it in GitHub Desktop.
Save betatim/9730aa06cd2ec0fb7d0cec7cfec8969b to your computer and use it in GitHub Desktop.
BloomFilter talk at PyDataZRH January 2017
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div style=\"position: relative\">\n",
" <img style=\"position: absolute; top: 0; right: 0; max-height: 130px\" src=\"logo.jpg\" />\n",
"</div>\n",
"# Bloom Filters\n",
"\n",
"\n",
"**Tim Head, [Wild Tree Tech](https://www.wildtreetech.com)**\n",
"\n",
"\n",
"**26 January 2017**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For more have a look at: http://betatim.github.io/posts/genome-hackers/ and http://betatim.github.io/posts/improbable-afternoon/."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.utils import murmurhash3_32"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def hash_(value, seed=1):\n",
" return murmurhash3_32(value, seed=seed, positive=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"3142237357"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hash_(\"hello\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"bits = np.zeros(100, dtype=bool)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False, False], dtype=bool)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bits"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def insert(bf, value):\n",
" idx = hash_(value) % bf.size\n",
" bf[idx] += 1\n",
" return bf"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, True, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False, False], dtype=bool)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"insert(bits, \"hello\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, True, False, True, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False, False], dtype=bool)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"insert(bits, \"world\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def contains(bits, value):\n",
" idx = hash_(value) % bits.size\n",
" return bits[idx]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"contains(bits, \"world\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class BloomFilter:\n",
" def __init__(self, m, k):\n",
" self.bits = np.zeros(m, dtype=bool)\n",
" self.k = k\n",
" \n",
" def insert(self, value):\n",
" for i in range(self.k):\n",
" idx = hash_(value, seed=1+i) % self.bits.size\n",
" self.bits[idx] += 1\n",
" \n",
" def contains(self, value):\n",
" for i in range(self.k):\n",
" idx = hash_(value, seed=1+i) % self.bits.size\n",
" if not self.bits[idx]:\n",
" return False\n",
" \n",
" return True"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"bf = BloomFilter(100, 3)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"bf.insert(\"world\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, True, False, False, False, True, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, True, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False, False, False, False], dtype=bool)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.bits"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.contains(\"world\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.contains(\"hello\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"graph = BloomFilter(1000, 3)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"graph.insert(\"ACGTT\")\n",
"graph.insert(\"CGTTA\")\n",
"graph.insert(\"GTTAT\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"9"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"graph.bits.sum()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def next_nodes(node):\n",
" for c in \"ACGT\":\n",
" yield node[1:] + c"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def walk(graph, start):\n",
" now = start\n",
" while True:\n",
" print(now, '->', end=\" \")\n",
" for nxt in next_nodes(now):\n",
" if graph.contains(nxt):\n",
" now = nxt\n",
" break\n",
" else:\n",
" break\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ACGTT -> CGTTA -> GTTAT -> "
]
}
],
"source": [
"walk(graph, \"ACGTT\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment