Skip to content

Instantly share code, notes, and snippets.

@apg
Created December 8, 2009 21:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save apg/252028 to your computer and use it in GitHub Desktop.
Save apg/252028 to your computer and use it in GitHub Desktop.
# quick and dirty inverted index
import re, os
class InvertedIndex(object):
    """Map each word to the set of documents that contain it.

    Words are maximal runs of the ASCII letters a-z taken from lowercased
    text; every other character (digits, punctuation, non-ASCII letters)
    acts as a separator when indexing files.
    """

    # Matches any single character that cannot be part of an indexed word.
    _NON_WORD = re.compile(r'[^a-z]')

    def __init__(self):
        # word -> set of document identifiers containing that word
        self._data = {}

    def add(self, words, document):
        """Record that every word in *words* occurs in *document*.

        words    -- iterable of words, stored exactly as given (callers
                    are expected to pass lowercase a-z words so that
                    ``search`` can find them)
        document -- hashable identifier stored in the per-word sets
        """
        for word in words:
            self._data.setdefault(word, set()).add(document)

    def index(self, file):
        """Read the text file at path *file* and index every word in it.

        The absolute path of *file* is used as the document identifier.
        Raises IOError/OSError if the file cannot be opened.
        """
        file = os.path.abspath(file)
        # The context manager guarantees the handle is closed, even if
        # reading fails part-way through.
        with open(file, 'r') as handle:
            for line in handle:
                words = [w for w in self._NON_WORD.split(line.lower()) if w]
                self.add(words, file)

    def search(self, terms):
        """Return the set of documents containing every term in *terms*.

        terms -- whitespace-separated query string.  Each term is
                 lowercased and stripped of characters other than a-z;
                 terms that end up empty are ignored.

        Returns a new set (safe for the caller to mutate).  The result is
        empty when no usable term remains or when any term is unknown.
        """
        # Lowercase before stripping so that "Hello" matches the indexed
        # "hello" instead of being reduced to "ello".
        cleaned = [self._NON_WORD.sub('', t.lower()) for t in terms.split()]
        sets = [self._data.get(term, set()) for term in cleaned if term]
        if not sets:
            return set()
        # set.intersection always builds a fresh set, so the index's own
        # sets are never handed out to callers.
        return set.intersection(*sets)
def test(files=None):
    """Index a few files and print the results of three sample searches.

    files -- optional iterable of file paths to index.  Defaults to three
             common Unix configuration files.  A path that cannot be
             opened is reported and skipped so the demo still runs on
             systems where one of the files is absent.
    """
    if files is None:
        files = ['/etc/passwd', '/etc/group', '/etc/resolv.conf']
    indx = InvertedIndex()
    for f in files:
        try:
            indx.index(f)
        except (IOError, OSError) as err:
            print('skipping %s: %s' % (f, err))
    # A single pre-formatted argument makes print() behave the same as a
    # statement (Python 2) and as a function (Python 3).  The two spaces
    # after the colon reproduce the separator the print statement inserted.
    for term in ('empty', 'lookupd', 'hello'):
        print('%s:  %s' % (term, indx.search(term)))
# Run the smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment