Skip to content

Instantly share code, notes, and snippets.

@jmhobbs
Created January 24, 2012 23:48
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save jmhobbs/1673557 to your computer and use it in GitHub Desktop.
Naive Search with JavaScript
#!/usr/bin/env python
import json
import re
import pprint
def tokenize(string):
    """Split *string* into lowercase word tokens.

    The text is lowercased and split on runs of whitespace. Every character
    other than ASCII letters, digits, apostrophes and hyphens is then removed
    from each word, so tokens such as "didn't" and "well-known" stay intact.

    Empty tokens are never returned: empty input, repeated spaces and words
    made up solely of punctuation contribute nothing to the result. Tabs and
    newlines separate words instead of silently gluing neighbours together.

    Args:
        string: The text to tokenize.

    Returns:
        A list of non-empty token strings in order of appearance; duplicates
        are kept.
    """
    tokens = []
    for word in string.lower().split():
        # Strip extra punctuation. The text is already lowercase, so the
        # allowed set only needs a-z, digits, apostrophe and hyphen.
        word = re.sub(r"[^a-z0-9'\-]", '', word)
        # A punctuation-only word (e.g. "...") collapses to '' here; skip it
        # so that the empty string never becomes a token.
        if word:
            tokens.append(word)
    return tokens
def main():
    """Build an inverted index from ``data.json`` and pretty-print it.

    ``data.json`` is read from the current working directory and must hold a
    JSON object with an ``entries`` list; each entry is accessed by its
    ``id``, ``title`` and ``body`` keys. The index maps every token found in
    an entry's title or body to the list of ids of the entries containing it,
    in the order the entries appear in the file.
    """
    index = {}
    with open('data.json', 'r') as handle:
        obj = json.load(handle)

    for entry in obj['entries']:
        # Break up both title and body. Collecting into a set makes the
        # tokens unique, so an entry id is listed at most once per token.
        tokens = set(tokenize(entry['title']))
        tokens.update(tokenize(entry['body']))

        # Add this id to the list of matches for each token; setdefault
        # creates the list the first time a token is seen.
        for token in tokens:
            index.setdefault(token, []).append(entry['id'])

    pprint.pprint(index)
# Run the index build only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
{
"entries": [
{
"id": 1,
"title": "The lazy white cat slept.",
"body": "What a lazy cat."
},
{
"id": 2,
"title": "George, though angry, didn't make a sound.",
"body": "George is a quiet man."
},
{
"id": 3,
"title": "Anyone could see that white didn't suit her.",
"body": "Plus, it's after Labor Day."
},
{
"id": 4,
"title": "By Thor's Hammer, I will have my revenge.",
"body": "Also, by Odin's Eye"
},
{
"id": 5,
"title": "Get off the couch you lazy bum.",
"body": "Yeah, it's way better to sit at a computer desk."
}
]
}
$ python build_index.py
{u'a': [1, 2, 5],
u'after': [3],
u'also': [4],
u'angry': [2],
u'anyone': [3],
u'at': [5],
u'better': [5],
u'bum': [5],
u'by': [4],
u'cat': [1],
u'computer': [5],
u'couch': [5],
u'could': [3],
u'day': [3],
u'desk': [5],
u"didn't": [2, 3],
u'eye': [4],
u'george': [2],
u'get': [5],
u'hammer': [4],
u'have': [4],
u'her': [3],
u'i': [4],
u'is': [2],
u"it's": [3, 5],
u'labor': [3],
u'lazy': [1, 5],
u'make': [2],
u'man': [2],
u'my': [4],
u"odin's": [4],
u'off': [5],
u'plus': [3],
u'quiet': [2],
u'revenge': [4],
u'see': [3],
u'sit': [5],
u'slept': [1],
u'sound': [2],
u'suit': [3],
u'that': [3],
u'the': [1, 5],
u"thor's": [4],
u'though': [2],
u'to': [5],
u'way': [5],
u'what': [1],
u'white': [1, 3],
u'will': [4],
u'yeah': [5],
u'you': [5]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment