-
-
Save skorokithakis/0abbfebced25fd4b2ed3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "Bloom filters" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from pybloom import BloomFilter\nimport os\nimport re", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# Read all my posts.\nposts = {post_name: open(POST_DIR + post_name).read() for post_name in os.listdir(POST_DIR)}\n# Create a dictionary of {\"post name\": \"lowercase word set\"}.\nsplit_posts = {name: set(re.split(\"\\W+\", contents.lower())) for name, contents in posts.items()}", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "filters = {}\nfor name, words in split_posts.items():\n filters[name] = BloomFilter(capacity=len(words), error_rate=0.1)\n for word in words:\n filters[name].add(word)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "def search(search_string):\n search_terms = re.split(\"\\W+\", search_string)\n return [name for name, filter in filters.items() if all(term in filter for term in search_terms)]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "search(\"android raspberry\")", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": "['2013-12-07 - bloom-filter-search-engine.md',\n '2013-06-19 - how-remote-control-rf-devices-raspberry-pi.md',\n '2013-06-24 - writing-my-first-android-app-control-your-raspberr.md']" | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# Average filter length per post, in bytes.\nsum(len(filter.bitarray.tobytes()) for filter in filters.values()) / len(filters)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 29, | |
"text": "295" | |
} | |
], | |
"prompt_number": 29 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "big_filter = BloomFilter(capacity=sum(len(words) for words in split_posts.values()), error_rate=0.1)\nfor name, words in split_posts.items():\n for word in words:\n big_filter.add(name + word)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 23 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "def search_big(search_string):\n search_terms = re.split(\"\\W+\", search_string)\n return [name for name in split_posts if all((name + term) in big_filter for term in search_terms)]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 24 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "search_big(\"android raspberry\")", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 25, | |
"text": "['2013-12-07 - bloom-filter-search-engine.md',\n '2013-06-19 - how-remote-control-rf-devices-raspberry-pi.md',\n '2013-06-24 - writing-my-first-android-app-control-your-raspberr.md']" | |
} | |
], | |
"prompt_number": 25 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "len(big_filter.bitarray.tobytes()) / len(split_posts)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 28, | |
"text": "294" | |
} | |
], | |
"prompt_number": 28 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment