Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Turn your Chrome bookmarks into a searchable, tokenized inverted index
"""Tokenize bookmark titles and create an inverted index."""
import re
import os
import sys
from pprint import pprint as ppr
from collections import Counter, defaultdict
from pyquery import PyQuery as pq
def fmt_token(regex, t):
    """Normalize a raw token for indexing.

    The token is lower-cased and every substring matched by `regex` is
    deleted.

    Args:
        regex: A pattern string or compiled pattern, passed to `re.sub`.
        t: The raw token text.

    Returns:
        The lower-cased token with all matches of `regex` removed. May be
        an empty string when the whole token matches.
    """
    # No str() coercion: re.sub already returns text, and forcing str()
    # raised UnicodeEncodeError on Python 2 for any non-ASCII remainder.
    return re.sub(regex, '', t.lower())
def make_iidex(path=None):
    """Tokenize bookmark titles and build an inverted index from them.

    Every `DT > a` element of the bookmarks HTML file contributes its text
    as a title. Each title is split on whitespace, each piece is normalized
    with `fmt_token` (lower-cased, non-letters removed), and stopwords and
    tokens shorter than two characters are dropped.

    Args:
        path: Location of the exported bookmarks HTML file. Defaults to
            `CHROME_bookmarks_6_14_17.html` in the current working
            directory.

    Returns:
        A `(distribution, iidex)` tuple where `distribution` is a `Counter`
        of token occurrences and `iidex` is a `defaultdict(list)` mapping
        each token to the titles that contain it (one entry per occurrence).
    """
    # Local import keeps this block self-contained; io.open accepts
    # `encoding` on both Python 2 and Python 3.
    import io

    if path is None:
        path = '{}/CHROME_bookmarks_6_14_17.html'.format(os.getcwd())
    replace_re = re.compile(r'[^a-zA-Z]+')
    # TODO: nltk stopwords
    stopwords = frozenset([
        'the', 'and', 'is', 'for', 'in', 'of', 'to', 'with',
    ])
    iidex = defaultdict(list)
    distribution = Counter()
    # Decode explicitly instead of relying on the platform default encoding;
    # undecodable bytes are replaced rather than aborting the whole run.
    # NOTE(review): assumes the export is UTF-8 -- confirm for other browsers.
    with io.open(path, 'r', encoding='utf-8', errors='replace') as bmarks:
        dom = pq(bmarks.read())
    for title in dom.find('DT > a'):
        text = title.text
        if text is None:
            continue
        # split() (any whitespace) rather than split(' '): otherwise words
        # separated by tabs/newlines are fused once the regex strips them.
        for raw in text.split():
            token = fmt_token(replace_re, raw)
            if len(token) > 1 and token not in stopwords:
                iidex[token].append(text)
                distribution[token] += 1
    return distribution, iidex
def interact(distr, iidex):
    """Drop into a debugger session for exploring the index by hand.

    Defines a local `find(term)` helper, prints a usage hint and then stops
    in `pdb`, where `find`, `distr` and `iidex` are reachable as locals.

    Args:
        distr: Token frequency table; not used here directly, but left in
            scope for inspection from the debugger prompt.
        iidex: Mapping of token -> list of bookmark titles.
    """
    def find(term):
        """Print every title indexed under a token that contains `term`."""
        # The index built by make_iidex stores lower-cased tokens, so the
        # query has to be lower-cased too or mixed-case terms never match.
        needle = term.lower()
        unique_titles = set()
        for token, titles in iidex.items():
            if needle in token:
                unique_titles.update(titles)
        # Sort so repeated searches list results in a stable order.
        matches = sorted(unique_titles)
        print('\n--- Searching for "{term}", found {amt} results. --\n'.format(
            term=term, amt=len(matches)
        ))
        for i, m in enumerate(matches):
            print('{}{}. {}\n'.format(' ' * 4, i, m))

    print('=== Use `find("xxx")` to explore something.')
    # The debugger prompt doubles as the interactive shell for this script.
    import pdb
    pdb.set_trace()
# Script entry point: build the index, then either explore it interactively
# (`-i` on the command line) or dump it to stdout.
if __name__ == '__main__':
    distribution, iidex = make_iidex()
    if '-i' in sys.argv:
        interact(distribution, iidex)
    else:
        ppr(iidex)
        ppr(distribution.most_common(20))
        # Dict views cannot be sliced on Python 3 (TypeError); materialize
        # the keys into a list before taking the first 20.
        ppr(list(iidex.keys())[:20])
@christabor
Copy link
Author

TODO: handle Unicode issues

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment