HarryR/titleindex.py

## titleindex.py
#!/usr/bin/env python
from __future__ import print_function

import json
import re
from base64 import b32encode
from hashlib import sha1


# Raw string literals are used so that the backslash escapes reach the regex
# engine untouched instead of being (invalid) Python string escapes.

# Matches a literal dot; split_words substitutes a space for each match.
RE_SPLIT = re.compile(r'[\.]')
# Matches any character that is not an ASCII letter, digit, space or hyphen;
# split_words deletes each match.
RE_NOTAPHALNUMERIC = re.compile(r'[^a-zA-Z0-9 -]')
# Matches a run of one or more whitespace characters.
RE_SPACES = re.compile(r'[\s]+')


def hash_stuff(segments):
    """
    Return a short token identifying a sequence of words.

    Every item of ``segments`` is fed, in order, into one SHA-1 hasher.
    A ``None`` item is replaced by the digest of everything hashed so far,
    so the position of a ``None`` marker changes the result.

    :param segments: iterable of text strings, byte strings or ``None``
    :return: the first 8 characters of the base32-encoded SHA-1 digest,
        as text so that it can be used directly as a JSON object key
    """
    hasher = sha1()
    for word in segments:
        if word is None:
            word = hasher.digest()
        elif not isinstance(word, bytes):
            # hashlib only accepts bytes on Python 3; encode text explicitly.
            word = word.encode('utf-8')
        hasher.update(word)
    # base32 output is pure ASCII, so decoding cannot fail.
    return b32encode(hasher.digest())[:8].decode('ascii')


def segmentize(word_list, max_N=3):
    """
    Split a list of words into unique segments up to max_N items long.

    The words are padded with a ``None`` boundary marker on each side and
    every window of 2..max_N consecutive items is collected once.  A window
    is then dropped when the same window with an extra ``None`` prepended
    or appended was also collected.  Finally each distinct word is added
    as a one-element list.

    e.g. one two thr
    with max_N = 3, becomes:

      (None, "one")
      ("thr", None)
      (None, "one", "two")
      ("one", "two", "thr")
      ("two", "thr", None)
      ["one"]
      ["two"]
      ["thr"]

    ("one", "two") and ("two", "thr") are left out because
    (None, "one", "two") and ("two", "thr", None) are present.

    :param word_list: sequence of hashable words
    :param max_N: longest window, counted in items including the markers
    :return: list holding the kept windows as tuples, followed by one
        one-element list per distinct word, both in first-seen order
    """
    words = list(word_list)
    padded = [None] + words + [None]

    # Collect each distinct window once, remembering first-seen order.
    # Only start positions that yield a full window are visited, so a slice
    # start can never go negative and wrap around to the end of the list.
    windows = []
    seen = set()
    for size in range(2, max_N + 1):
        for start in range(len(padded) - size + 1):
            window = tuple(padded[start:start + size])
            if window not in seen:
                seen.add(window)
                windows.append(window)

    # A list comprehension (rather than filter()) keeps the result a real
    # list on Python 3, where filter() returns an iterator.
    kept = [
        window for window in windows
        if (None,) + window not in seen and window + (None,) not in seen
    ]

    singles = []
    seen_words = set()
    for word in words:
        if word not in seen_words:
            seen_words.add(word)
            singles.append([word])

    return kept + singles


def split_words(title):
    """
    Split title into lower-case words, removing punctuation.

    Dots act as word separators; every other character that is not an
    ASCII letter, digit, space or hyphen is deleted.

    :param title: text to split
    :return: list of words; empty when the title contains no word
    """
    # Dots must become spaces before the punctuation filter runs: that
    # filter deletes dots too, which would glue "a.b" together as "ab".
    spaced = RE_SPLIT.sub(' ', title.lower())
    cleaned = RE_NOTAPHALNUMERIC.sub('', spaced)
    # split() without an argument collapses runs of whitespace and returns
    # [] for a blank string, where split(' ') would return [''].
    return cleaned.split()


# Read one JSON record per line from outfile.json and print, for each record,
# a JSON object mapping the record's id to its {segment hash: segment} index.
with open("outfile.json") as infile:
    for line in infile:
        data = json.loads(line)
        words = split_words(data['title'])
        segments = dict(
            (hash_stuff(segment), segment)
            for segment in segmentize(words)
        )
        print(json.dumps({data['id']: segments}))
	#!/usr/bin/env python
	from __future__ import print_function

	import json
	import re
	from base64 import b32encode
	from hashlib import sha1


	# Raw string literals are used so that the backslash escapes reach the
	# regex engine untouched instead of being (invalid) Python string escapes.

	# Matches a literal dot; split_words substitutes a space for each match.
	RE_SPLIT = re.compile(r'[\.]')
	# Matches any character that is not an ASCII letter, digit, space or
	# hyphen; split_words deletes each match.
	RE_NOTAPHALNUMERIC = re.compile(r'[^a-zA-Z0-9 -]')
	# Matches a run of one or more whitespace characters.
	RE_SPACES = re.compile(r'[\s]+')


	def hash_stuff(segments):
	    """
	    Return a short token identifying a sequence of words.

	    Every item of ``segments`` is fed, in order, into one SHA-1 hasher.
	    A ``None`` item is replaced by the digest of everything hashed so
	    far, so the position of a ``None`` marker changes the result.

	    :param segments: iterable of text strings, byte strings or ``None``
	    :return: the first 8 characters of the base32-encoded SHA-1 digest,
	        as text so that it can be used directly as a JSON object key
	    """
	    hasher = sha1()
	    for word in segments:
	        if word is None:
	            word = hasher.digest()
	        elif not isinstance(word, bytes):
	            # hashlib only accepts bytes on Python 3; encode explicitly.
	            word = word.encode('utf-8')
	        hasher.update(word)
	    # base32 output is pure ASCII, so decoding cannot fail.
	    return b32encode(hasher.digest())[:8].decode('ascii')


	def segmentize(word_list, max_N=3):
	    """
	    Split a list of words into unique segments up to max_N items long.

	    The words are padded with a ``None`` boundary marker on each side
	    and every window of 2..max_N consecutive items is collected once.
	    A window is then dropped when the same window with an extra ``None``
	    prepended or appended was also collected.  Finally each distinct
	    word is added as a one-element list.

	    e.g. one two thr
	    with max_N = 3, becomes:

	      (None, "one")
	      ("thr", None)
	      (None, "one", "two")
	      ("one", "two", "thr")
	      ("two", "thr", None)
	      ["one"]
	      ["two"]
	      ["thr"]

	    ("one", "two") and ("two", "thr") are left out because
	    (None, "one", "two") and ("two", "thr", None) are present.

	    :param word_list: sequence of hashable words
	    :param max_N: longest window, counted in items including the markers
	    :return: list holding the kept windows as tuples, followed by one
	        one-element list per distinct word, both in first-seen order
	    """
	    words = list(word_list)
	    padded = [None] + words + [None]

	    # Collect each distinct window once, remembering first-seen order.
	    # Only start positions that yield a full window are visited, so a
	    # slice start can never go negative and wrap around.
	    windows = []
	    seen = set()
	    for size in range(2, max_N + 1):
	        for start in range(len(padded) - size + 1):
	            window = tuple(padded[start:start + size])
	            if window not in seen:
	                seen.add(window)
	                windows.append(window)

	    # A list comprehension (rather than filter()) keeps the result a
	    # real list on Python 3, where filter() returns an iterator.
	    kept = [
	        window for window in windows
	        if (None,) + window not in seen and window + (None,) not in seen
	    ]

	    singles = []
	    seen_words = set()
	    for word in words:
	        if word not in seen_words:
	            seen_words.add(word)
	            singles.append([word])

	    return kept + singles


	def split_words(title):
	    """
	    Split title into lower-case words, removing punctuation.

	    Dots act as word separators; every other character that is not an
	    ASCII letter, digit, space or hyphen is deleted.

	    :param title: text to split
	    :return: list of words; empty when the title contains no word
	    """
	    # Dots must become spaces before the punctuation filter runs: that
	    # filter deletes dots too, which would glue "a.b" together as "ab".
	    spaced = RE_SPLIT.sub(' ', title.lower())
	    cleaned = RE_NOTAPHALNUMERIC.sub('', spaced)
	    # split() without an argument collapses runs of whitespace and
	    # returns [] for a blank string, where split(' ') would return [''].
	    return cleaned.split()


	# Read one JSON record per line from outfile.json and print, for each
	# record, a JSON object mapping the record's id to its
	# {segment hash: segment} index.
	with open("outfile.json") as infile:
	    for line in infile:
	        data = json.loads(line)
	        words = split_words(data['title'])
	        segments = dict(
	            (hash_stuff(segment), segment)
	            for segment in segmentize(words)
	        )
	        print(json.dumps({data['id']: segments}))