Skip to content

Instantly share code, notes, and snippets.

@enzzc
Created May 2, 2019 11:00
Show Gist options
  • Save enzzc/f0a7363c1500a601b587077a15791151 to your computer and use it in GitHub Desktop.
Save enzzc/f0a7363c1500a601b587077a15791151 to your computer and use it in GitHub Desktop.
from functools import reduce
from collections import defaultdict
class IndexBackend:
    """Interface that every index backend is expected to provide.

    Both methods are placeholders: the base implementations only raise
    ``NotImplementedError`` and are meant to be overridden by subclasses.
    """

    def index_document(self, doc_id, text):
        """Register ``text`` under ``doc_id``; must be overridden."""
        raise NotImplementedError()

    def search(self, query):
        """Look up ``query`` in the index; must be overridden."""
        raise NotImplementedError()
class DictIndex(IndexBackend):
    """In-memory inverted index: maps each token to the ids of the documents containing it."""

    def __init__(self):
        # token -> set of doc_ids; defaultdict creates the posting set on
        # first insertion so index_document() needs no existence check.
        self._index = defaultdict(set)

    def index_document(self, doc_id, text):
        """Add ``doc_id`` to the posting set of every token produced from ``text``.

        Args:
            doc_id: Hashable identifier of the document.
            text: Raw document text; it is split into tokens by ``tokenize``.
        """
        index = self._index
        for word in tokenize(text):
            index[word].add(doc_id)

    def search(self, query):
        """Return the ids of the documents that contain every known query token.

        Query tokens that have no postings in the index are ignored instead
        of forcing an empty result.

        Args:
            query: Raw query text; it is split into tokens by ``tokenize``.

        Returns:
            A new ``set`` of doc_ids (never a reference to internal state).
            It is empty when no query token is indexed or when the indexed
            tokens share no document.
        """
        index = self._index
        # None means "no known token seen yet". An empty set cannot serve as
        # that marker: it is also a legitimate intersection result and must
        # not be re-seeded by the next token.
        results = None
        for word in tokenize(query):
            # .get() avoids creating empty entries in the defaultdict.
            postings = index.get(word)
            if not postings:
                continue
            if results is None:
                results = set(postings)  # copy: never hand out internal sets
                continue
            results &= postings
            if not results:
                # The intersection is already empty; further tokens cannot
                # add documents back.
                break
        return results if results is not None else set()
# Words dropped by default because they carry no search value.
# Built once at import time instead of on every tokenize() call.
_DEFAULT_STOP_WORDS = frozenset({
    'the', 'is', 'a', 'an', 'are',
    'and', 'okay', 'right', 'wrong',
    'that', 'this', 'those', 'both',
    'thus', 'however', 'despite',
    'here', 'so', 'we', 'you', 'can',
    'how', 'where', 'when', 'of', 'than',
    'then',
})


def tokenize(text, stop_words=_DEFAULT_STOP_WORDS):
    """Yield the lowercased, whitespace-separated words of ``text``, skipping stop words.

    Args:
        text: String to split. It is lowercased first, so tokens are
            case-insensitive; punctuation is left attached to the words.
        stop_words: Container of lowercase words to drop. Defaults to the
            built-in English stop-word list.

    Yields:
        Each remaining word, in order of appearance (duplicates included).
    """
    for word in text.lower().split():
        if word not in stop_words:
            yield word
# Sample corpus used by the interactive demo.
doc1 = 'Penguins are a group of aquatic flightless birds'
doc2 = 'Dolphins are a widely distributed and diverse group of aquatic mammals'
doc3 = 'Japanese bush warblers are Asian passerine birds more often heard than seen'

# Demo index populated with the sample corpus.
index = DictIndex()
index.index_document("d1", doc1)
index.index_document("d2", doc2)
index.index_document("d3", doc3)


def main():
    """Run an interactive search prompt against the demo index.

    Reads queries from standard input and prints the matching set of
    document ids for each one. The loop ends cleanly on end-of-input
    (Ctrl-D) or Ctrl-C instead of terminating with a traceback.
    """
    while True:
        try:
            inp = input('Search> ')
        except (EOFError, KeyboardInterrupt):
            # Finish the prompt line so the shell prompt starts on a new line.
            print()
            return
        result = index.search(inp)
        print(result)


# Only start the prompt when executed as a script, so importing this module
# does not block on input().
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment