# Turn your Chrome bookmarks into a searchable, tokenized inverted index.
"""Tokenize bookmark titles and create an inverted index."""
import re
import os
import sys
from pprint import pprint as ppr
from collections import Counter, defaultdict
from pyquery import PyQuery as pq
def fmt_token(regex, t):
    """Normalize a raw token: lowercase it and strip everything `regex` matches.

    Args:
        regex: A pattern string or a compiled pattern; every match is
            removed from the token.
        t: The raw token text.

    Returns:
        The lowercased token with all `regex` matches deleted, as a `str`.
        The result may be empty if the whole token matched.
    """
    # re.sub accepts either a pattern string or a compiled pattern object,
    # so callers are free to pre-compile the regex once and reuse it.
    return str(re.sub(regex, '', t.lower()))
# Build a token frequency distribution and an inverted index from a Chrome
# bookmarks HTML export, returning them as the tuple (distribution, iidex).
#
# NOTE(review): as visible here this block has lost its indentation and
# several statements appear to be missing (flagged individually below).
# The comments describe only what the visible lines establish; the gaps
# must be restored from the original file before this can run.
def make_iidex():
# Inverted index; missing keys default to an empty list.
iidex = defaultdict(list)
# The export is looked up by a fixed filename in the current working directory.
path = '{}/CHROME_bookmarks_6_14_17.html'.format(os.getcwd())
# Matches runs of anything that is not an ASCII letter; passed to fmt_token
# below as the pattern to strip from each token.
replace_re = re.compile(r'[^a-zA-Z]+')
# Feeds the Counter built at the end of the function.
# NOTE(review): nothing visible appends to this list -- presumably the
# missing body of the `for token in tokens` loop does; confirm.
totals = []
# TODO: nltk stopwords
# NOTE(review): this list literal is never closed in the visible text; the
# closing bracket (and possibly further entries) appears to be missing.
stopwords = [
'the', 'and', 'is', 'for', 'in', 'of', 'to', 'with',
with open(path, 'r') as bmarks:
# NOTE(review): the argument(s) and closing parenthesis of this pq(...) call
# are not visible -- presumably the contents of `bmarks`; confirm.
dom = pq(
# Select <a> elements that are direct children of a <DT> element.
titles = dom.find('DT > a')
for title in titles:
# NOTE(review): the body of this None check is not visible -- presumably it
# skips anchors without text, since the next line calls .split() on it; confirm.
if title.text is None:
# Split the anchor text on single spaces, normalize each piece with
# fmt_token, then drop stopwords and tokens of length 1 or less.
tokens = title.text.split(' ')
tokens = [fmt_token(replace_re, t) for t in tokens]
tokens = [t for t in tokens if t not in stopwords and len(t) > 1]
# NOTE(review): the body of this loop is not visible -- presumably it records
# each token in `totals` and adds an entry for it to `iidex`; confirm what
# value is stored per token.
for token in tokens:
# Frequency count over everything accumulated in `totals`.
distribution = Counter(totals)
return distribution, iidex
def interact(distr, iidex):
    """Open an interactive debugger session for exploring the index.

    A nested `find(term)` helper is defined first so that it is available
    as a local name once the debugger prompt opens.

    Args:
        distr: The token distribution. Not used by the code in this
            function, but it stays in scope for inspection in the debugger.
        iidex: Mapping of token -> list of entries, searched by `find`.
    """
    def find(term):
        """Print every unique entry whose index key contains `term`."""
        matches = []
        for k, v in iidex.items():
            # Substring match on the key, not exact equality.
            if term in k:
                matches += v
        # Collapse duplicates gathered from different keys; note that the
        # resulting order is whatever the set yields.
        matches = list(set(matches))
        print('\n--- Searching for "{term}", found {amt} results. --\n'.format(
            term=term, amt=len(matches)
        ))
        for i, m in enumerate(matches):
            print('{}{}. {}\n'.format(' ' * 4, i, m))

    print('=== Use `find("xxx")` to explore something.')
    # Drop into pdb with `find`, `distr` and `iidex` as locals.
    import pdb
    pdb.set_trace()
if __name__ == '__main__':
    # Always build the index; only open the interactive session when the
    # `-i` flag appears among the command-line arguments.
    distribution, iidex = make_iidex()
    if '-i' in sys.argv:
        interact(distribution, iidex)
# TODO: Handle Unicode issues.

