Last active
June 6, 2018 14:53
-
-
Save angellandros/25a66cee2989dd05e9f4d18aabc4afb8 to your computer and use it in GitHub Desktop.
Inverted Index on 5-Shingles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
# Sample corpus: maps a document id to its text.
T = {'1': ' payday', '2': 'mayday mayday', '3': 'day may'}

# Inverted index: maps each 5-character shingle to the set of ids of the
# documents in T whose text contains that shingle.
index = defaultdict(set)

# create 5-shingles
for k, v in T.items():
    # A text of length n has n - 4 windows of width 5; for n < 5 the range
    # is empty, so such a text contributes nothing to the index.
    for i in range(len(v) - 4):
        index[v[i:i+5]].add(k)
# latex representation of the sets
def set_tex(s):
    r"""Format the elements of ``s`` as a LaTeX math-mode set literal.

    The elements are joined with ", " in sorted order so the output is
    reproducible: the iteration order of a set of strings is not stable
    from one interpreter run to the next.

    Args:
        s: An iterable of strings (here: a set of document ids).

    Returns:
        The LaTeX source, e.g. ``$\{ 1, 2 \}$`` for ``{'2', '1'}``.
    """
    ss = ', '.join(sorted(s))
    # Backslashes are escaped explicitly: '\{' and '\}' are not valid
    # Python escape sequences and trigger a warning when written bare.
    return '$\\{ %s \\}$' % ss
# latex tabular: one row per shingle, in the form
#   <shingle> & <set of document ids> \\
# with the rows ordered by shingle.
rows = sorted(index.items(), key=lambda entry: entry[0])
for shingle, doc_ids in rows:
    # Spaces inside a shingle are replaced by \textvisiblespace so they
    # remain visible in the typeset table.
    visible = shingle.replace(' ', '{}\\textvisiblespace{}')
    print(visible, '&', set_tex(doc_ids), '\\\\')
# represent sets of words
def set_tex_word(s):
    r"""Format a set of words/shingles as a LaTeX math-mode set literal.

    Each element is wrapped in ``\mbox{...}`` and every space inside it is
    replaced by ``{}\textvisiblespace{}``. The elements appear in sorted
    order, separated by ", ".

    The space replacement is applied to each element before joining, so
    the separator is never touched and a comma that is part of an element
    is left exactly as it is.

    Args:
        s: An iterable of strings.

    Returns:
        The LaTeX source, e.g. ``$\{ \mbox{a{}\textvisiblespace{}b} \}$``
        for ``{'a b'}``.
    """
    boxed = [
        '\\mbox{%s}' % x.replace(' ', '{}\\textvisiblespace{}')
        for x in sorted(s)
    ]
    # Backslashes are escaped explicitly: '\{' and '\}' are not valid
    # Python escape sequences and trigger a warning when written bare.
    return '$\\{ %s \\}$' % ', '.join(boxed)
# create sets of 5-shingles: maps each document id in T to the set of all
# 5-character substrings of its text.
TT = defaultdict(set)
for k, v in T.items():
    # A text of length n has n - 4 windows of width 5 (none when n < 5).
    for i in range(len(v) - 4):
        TT[k].add(v[i:i+5])

# Print the shingle set of each document as LaTeX. This used to call the
# undefined name set_tex_1, which raised NameError; set_tex_word is the
# word-set formatter this file actually defines.
print(set_tex_word(TT['1']))
print(set_tex_word(TT['2']))
print(set_tex_word(TT['3']))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment