Created
March 5, 2017 18:03
-
-
Save daniel-j-h/c626acfd32b51e7e1fe9e0aedd557514 to your computer and use it in GitHub Desktop.
q-gram inverted lists index for fuzzy autocompletion - works surprisingly well already
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import argparse | |
import collections | |
import curses | |
import curses.textpad | |
#python3 -m venv --system-site-packages venv | |
#. venv/bin/activate | |
#python3 -m pip install osmium | |
import osmium | |
class StreetNameCollector(osmium.SimpleHandler): | |
def __init__(_): | |
super().__init__() | |
_.names = [] | |
def way(_, w): | |
if not 'highway' in w.tags: | |
return | |
if not 'name' in w.tags: | |
return | |
_.names.append(w.tags['name']) | |
def normalize(_): | |
_.names = list(set(_.names)) | |
def ngrams(seq, n): | |
suffixes = (seq[i:] for i in range(n)) | |
parallel = zip(*suffixes) | |
combine = lambda v: ''.join(v) | |
return map(combine, parallel) | |
def bigrams(seq): | |
return ngrams(seq, 2) | |
def jaccard(a, b): | |
intersection = len(set.intersection(a, b)) | |
union = len(a) + len(b) - intersection | |
return intersection / union | |
def makeGramInvertedIndex(seq, gramer, normalizer): | |
index = collections.defaultdict(list) | |
for idx, v in enumerate(seq): | |
grams = gramer(normalizer(v)) | |
for gram in grams: | |
index[gram].append(idx) | |
return index | |
def makeBigramInvertedIndex(seq, normalizer): | |
return makeGramInvertedIndex(seq, bigrams, normalizer) | |
def normalize(text): | |
return text.lower() | |
def shell(stdscr, names, index): | |
curses.start_color() | |
curses.use_default_colors() | |
curses.curs_set(0) | |
k = 10 | |
query = [] | |
while True: | |
c = stdscr.getch() | |
if 31 < c < 127: | |
query.append(chr(c)) | |
text = ''.join(query) | |
grams = list(bigrams(text)) | |
stdscr.addstr(1, 2, 'Input: {}'.format(text), curses.A_BOLD) | |
stdscr.addstr(2, 2, 'Grams: {}'.format(' '.join(grams)), curses.A_BOLD) | |
# tbd: lookup should be in own type | |
flatten = lambda v: sum(v, []) | |
ids = set(flatten([index[gram] for gram in grams])) | |
similarity = lambda v: jaccard(set(bigrams(normalize(v))), set(grams)) | |
suggestions = [names[i] for i in ids] | |
suggestions.sort(key=similarity, reverse=True) | |
for offset, v in enumerate(suggestions[:k]): | |
stdscr.addstr(4 + offset, 2, '{:.2f} {}'.format(similarity(v), v)) | |
else: | |
query = [] | |
stdscr.clear() | |
def main(): | |
args = arguments() | |
collector = StreetNameCollector() | |
collector.apply_file(filename=args.osm) | |
collector.normalize() | |
print('Collected {} street names from osm input extract'.format(len(collector.names))) | |
index = makeBigramInvertedIndex(collector.names, normalize) | |
print('Built gram inverted list index with {} gram keys'.format(len(index))) | |
curses.wrapper(shell, collector.names, index) | |
def arguments(): | |
parser = argparse.ArgumentParser(description='geocoder for osm street names', | |
formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
parser.add_argument('osm', type=str, help='osm base map (e.g. monaco.osm.pbf)') | |
return parser.parse_args() | |
if __name__ == '__main__': | |
try: | |
main() | |
except KeyboardInterrupt: | |
sys.exit('\nBye') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment