Skip to content

Instantly share code, notes, and snippets.

@skorokithakis
Created December 7, 2013 22:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save skorokithakis/0abbfebced25fd4b2ed3 to your computer and use it in GitHub Desktop.
Save skorokithakis/0abbfebced25fd4b2ed3 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "Bloom filters"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": "from pybloom import BloomFilter\nimport os\nimport re",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Read all my posts.\nposts = {post_name: open(POST_DIR + post_name).read() for post_name in os.listdir(POST_DIR)}\n# Create a dictionary of {\"post name\": \"lowercase word set\"}.\nsplit_posts = {name: set(re.split(\"\\W+\", contents.lower())) for name, contents in posts.items()}",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": "filters = {}\nfor name, words in split_posts.items():\n filters[name] = BloomFilter(capacity=len(words), error_rate=0.1)\n for word in words:\n filters[name].add(word)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": "def search(search_string):\n search_terms = re.split(\"\\W+\", search_string)\n return [name for name, filter in filters.items() if all(term in filter for term in search_terms)]",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": "search(\"android raspberry\")",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 13,
"text": "['2013-12-07 - bloom-filter-search-engine.md',\n '2013-06-19 - how-remote-control-rf-devices-raspberry-pi.md',\n '2013-06-24 - writing-my-first-android-app-control-your-raspberr.md']"
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Average filter length per post, in bytes.\nsum(len(filter.bitarray.tobytes()) for filter in filters.values()) / len(filters)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 29,
"text": "295"
}
],
"prompt_number": 29
},
{
"cell_type": "code",
"collapsed": false,
"input": "big_filter = BloomFilter(capacity=sum(len(words) for words in split_posts.values()), error_rate=0.1)\nfor name, words in split_posts.items():\n for word in words:\n big_filter.add(name + word)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": "def search_big(search_string):\n search_terms = re.split(\"\\W+\", search_string)\n return [name for name in split_posts if all((name + term) in big_filter for term in search_terms)]",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": "search_big(\"android raspberry\")",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 25,
"text": "['2013-12-07 - bloom-filter-search-engine.md',\n '2013-06-19 - how-remote-control-rf-devices-raspberry-pi.md',\n '2013-06-24 - writing-my-first-android-app-control-your-raspberr.md']"
}
],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": "len(big_filter.bitarray.tobytes()) / len(split_posts)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 28,
"text": "294"
}
],
"prompt_number": 28
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment