Skip to content

Instantly share code, notes, and snippets.

@daniel-j-h
Created March 5, 2017 18:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daniel-j-h/c626acfd32b51e7e1fe9e0aedd557514 to your computer and use it in GitHub Desktop.
Save daniel-j-h/c626acfd32b51e7e1fe9e0aedd557514 to your computer and use it in GitHub Desktop.
q-gram inverted lists index for fuzzy autocompletion - works surprisingly well already
#!/usr/bin/env python3
import sys
import argparse
import collections
import curses
import curses.textpad
#python3 -m venv --system-site-packages venv
#. venv/bin/activate
#python3 -m pip install osmium
import osmium
class StreetNameCollector(osmium.SimpleHandler):
def __init__(_):
super().__init__()
_.names = []
def way(_, w):
if not 'highway' in w.tags:
return
if not 'name' in w.tags:
return
_.names.append(w.tags['name'])
def normalize(_):
_.names = list(set(_.names))
def ngrams(seq, n):
suffixes = (seq[i:] for i in range(n))
parallel = zip(*suffixes)
combine = lambda v: ''.join(v)
return map(combine, parallel)
def bigrams(seq):
return ngrams(seq, 2)
def jaccard(a, b):
intersection = len(set.intersection(a, b))
union = len(a) + len(b) - intersection
return intersection / union
def makeGramInvertedIndex(seq, gramer, normalizer):
index = collections.defaultdict(list)
for idx, v in enumerate(seq):
grams = gramer(normalizer(v))
for gram in grams:
index[gram].append(idx)
return index
def makeBigramInvertedIndex(seq, normalizer):
return makeGramInvertedIndex(seq, bigrams, normalizer)
def normalize(text):
return text.lower()
def shell(stdscr, names, index):
curses.start_color()
curses.use_default_colors()
curses.curs_set(0)
k = 10
query = []
while True:
c = stdscr.getch()
if 31 < c < 127:
query.append(chr(c))
text = ''.join(query)
grams = list(bigrams(text))
stdscr.addstr(1, 2, 'Input: {}'.format(text), curses.A_BOLD)
stdscr.addstr(2, 2, 'Grams: {}'.format(' '.join(grams)), curses.A_BOLD)
# tbd: lookup should be in own type
flatten = lambda v: sum(v, [])
ids = set(flatten([index[gram] for gram in grams]))
similarity = lambda v: jaccard(set(bigrams(normalize(v))), set(grams))
suggestions = [names[i] for i in ids]
suggestions.sort(key=similarity, reverse=True)
for offset, v in enumerate(suggestions[:k]):
stdscr.addstr(4 + offset, 2, '{:.2f} {}'.format(similarity(v), v))
else:
query = []
stdscr.clear()
def main():
args = arguments()
collector = StreetNameCollector()
collector.apply_file(filename=args.osm)
collector.normalize()
print('Collected {} street names from osm input extract'.format(len(collector.names)))
index = makeBigramInvertedIndex(collector.names, normalize)
print('Built gram inverted list index with {} gram keys'.format(len(index)))
curses.wrapper(shell, collector.names, index)
def arguments():
parser = argparse.ArgumentParser(description='geocoder for osm street names',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('osm', type=str, help='osm base map (e.g. monaco.osm.pbf)')
return parser.parse_args()
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
sys.exit('\nBye')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment