Last active
April 1, 2017 15:56
-
-
Save HarryR/79a183ac0f8a0a4e67e2930f54e5cf0b to your computer and use it in GitHub Desktop.
Generate word indexes for JSON document titles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import print_function | |
import json | |
import re | |
from base64 import b32encode | |
from hashlib import sha1 | |
# Pre-compiled patterns used by split_words().  Raw string literals are used
# so that the regex escapes (\. and \s) are not parsed as (invalid) string
# escape sequences.

# Matches a literal full stop.
RE_SPLIT = re.compile(r'[\.]')
# Matches any character that is not an ASCII letter, digit, space or hyphen.
RE_NOTAPHALNUMERIC = re.compile(r'[^a-zA-Z0-9 -]')
# Matches a run of one or more whitespace characters.
RE_SPACES = re.compile(r'[\s]+')
def hash_stuff(segments):
    """Return a short identifier derived from a sequence of words.

    Every item of *segments* is fed, in order, into one running SHA-1 hash.
    A ``None`` item is replaced by the digest of everything hashed so far,
    so a ``None`` at the start, at the end, or absent altogether each give
    a different result.

    :param segments: iterable of words (text or bytes) and/or ``None``.
    :return: the first 8 characters of the Base32-encoded SHA-1 digest,
        as text so that it can be used directly as a JSON object key.
    """
    hasher = sha1()
    for word in segments:
        if word is None:
            word = hasher.digest()
        elif not isinstance(word, bytes):
            # hashlib only accepts bytes; text words must be encoded first.
            word = word.encode('utf-8')
        hasher.update(word)
    # b32encode() yields bytes; Base32 output is pure ASCII, so decoding
    # cannot fail.
    return b32encode(hasher.digest())[:8].decode('ascii')
def segmentize(word_list, max_N=3):
    """
    Split the list of words into unique segments up to max_N items long.

    The words are padded with a ``None`` marker at each end and every
    window of 2..max_N consecutive items is collected.  A window is then
    dropped when the same window extended by one ``None`` marker, on either
    side, was also collected.  Finally each distinct word is added as a
    one-element segment.

    e.g. one two thr
    with max_N = 3, becomes:
        (None, "one")
        ("thr", None)
        (None, "one", "two")
        ("one", "two", "thr")
        ("two", "thr", None)
        ["one"]
        ["two"]
        ["thr"]
    ("one", "two") and ("two", "thr") are dropped because
    (None, "one", "two") and ("two", "thr", None) are present.

    :param word_list: sequence of hashable words.
    :param max_N: length of the longest window; values below 2 yield only
        the single-word segments.
    :return: a list in first-seen order: multi-item segments as tuples
        (shortest windows first), followed by each distinct word as a
        one-element list.  An empty *word_list* yields an empty list.
    """
    words = list(word_list)
    if not words:
        return []
    padded = [None] + words + [None]

    windows = []
    seen = set()
    for size in range(2, max_N + 1):
        # Only start where a full window fits.  A negative slice start would
        # wrap around to the end of the list and yield bogus segments.
        for start in range(len(padded) - size + 1):
            window = tuple(padded[start:start + size])
            if window not in seen:
                seen.add(window)
                windows.append(window)

    boundary = (None,)
    kept = [
        window for window in windows
        if boundary + window not in seen and window + boundary not in seen
    ]

    # Distinct single words, keeping the order of first appearance.
    singles = []
    seen_words = set()
    for word in words:
        if word not in seen_words:
            seen_words.add(word)
            singles.append([word])

    # Build a real list explicitly: filter() returns an iterator on
    # Python 3 and cannot be concatenated with a list.
    return kept + singles
def split_words(title):
    """Split title into lowercase words, removing punctuation, etc.

    Full stops act as word separators; every other character that is not
    an ASCII letter, digit, space or hyphen is removed.

    :param title: the title text.
    :return: list of words; empty when the title contains no word
        characters.
    """
    # Turn full stops into spaces *before* stripping punctuation: stripping
    # first would delete the dots and glue the neighbouring words together.
    spaced = RE_SPLIT.sub(' ', title.lower())
    cleaned = RE_NOTAPHALNUMERIC.sub('', spaced)
    # Splitting on whitespace runs leaves empty strings at the edges (and for
    # an empty title); drop them so callers never see a '' word.
    return [word for word in RE_SPACES.split(cleaned) if word]
# Read one JSON document per line and print, for each document, a JSON object
# mapping its id to a {segment hash: segment words} dictionary.
with open("outfile.json") as infile:
    for line in infile:
        if not line.strip():
            # Blank lines are not JSON documents; json.loads() would raise.
            continue
        data = json.loads(line)
        words = split_words(data['title'])
        segments = {hash_stuff(segment): segment
                    for segment in segmentize(words)}
        print(json.dumps({data['id']: segments}))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment