@tokestermw
Last active April 9, 2019 08:41
Using deep learning to approximate a B-Tree index from this paper: https://arxiv.org/abs/1712.01208 (The Case for Learned Index Structures)
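The idea, following the paper: in a sorted array the position of a key is N times the empirical CDF of the keys evaluated at that key, so the index can be learned as a smooth one-dimensional regression. To try it, assuming the script below is saved as learned_index.py (the filename is illustrative):

    python learned_index.py ranged --n 1000 --lr 9e-3

Only the 'ranged' mode is implemented. The loss is printed at every step and training runs until interrupted with Ctrl-C, after which one prediction is compared against binary search and a pdb prompt opens for further probing.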
import math
import os
import random
from bisect import bisect_left

import click
import torch
import torch.nn.functional as F
url_to_english_words = \
    'https://raw.githubusercontent.com/dwyl/english-words/master/words.txt'
def get_english_words():
    """Download (via wget, if not already cached) and read the word list."""
    if not os.path.isfile('words.txt'):
        import subprocess
        subprocess.call(['wget', url_to_english_words])
    with open('words.txt') as f:
        english_words = [line.strip() for line in f]
    return english_words
def exponential_distribution(lambda_=1.0):
    """Sample from Exp(lambda_) via inverse-transform sampling."""
    u = 1.0 - random.random()  # in (0, 1]; avoids log(0)
    x = -math.log(u) / lambda_
    return x
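# Note: this is inverse-transform sampling (solving u = 1 - e^(-lambda * x)
# for x); the standard library's random.expovariate(lambda_) is equivalent.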
def sorted_numbers(N=1000):
    # TODO: more complicated list
    numbers = sorted(exponential_distribution() for _ in range(N))

    def random_fun():
        """Sample a (key, position) training pair from the sorted array."""
        x = random.choice(numbers)
        y = bisect_left(numbers, x)  # same as numbers.index(x), but O(log N)
        return x, y

    return numbers, random_fun
def sorted_hash_map(N=1000):
    # ignore the map for now, just get random hashes
    english_words = get_english_words()[:N]
    hashes = sorted(hash(word) for word in english_words)  # pseudo hash map

    def random_fun():
        """Sample a (hash, position) training pair from the sorted hashes."""
        index = random.randint(0, N - 1)
        hash_ = hashes[index]  # the numeric hash; a word can't be featurized
        return hash_, index

    return hashes, random_fun
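# Note: Python string hashes are 64-bit integers, too large for float32 to
# represent exactly, so they would need rescaling (e.g. rank or min-max
# normalization) before _featurize if the 'hash' mode were wired into main.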
def get_model(dim=128):
    """Two-layer MLP mapping a scalar key to a (scaled) position."""
    model = torch.nn.Sequential(
        torch.nn.Linear(1, dim),
        torch.nn.ReLU(),
        torch.nn.Linear(dim, 1),
    )
    return model
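# A single MLP like this is the one-stage case of the paper's Recursive Model
# Index (RMI), which arranges many such models in stages, an upper model
# picking which lower model handles a given key.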
def _featurize(x):
    """Turn a list of scalars into a (batch, 1) float tensor."""
    return torch.unsqueeze(torch.tensor(x, dtype=torch.float), 1)
def naive_index_search(x, numbers):
    """Linear scan for the position of the last element <= x (O(N))."""
    for idx, n in enumerate(numbers):
        if n > x:
            break
    else:  # x >= every element
        idx = len(numbers)
    return idx - 1
def bisect_search(x, numbers):
    """Binary search for the position of x in the sorted array (O(log N)).

    Returns the first position whose element is >= x, which for keys present
    in the array matches the training target y in random_fun.
    """
    return bisect_left(numbers, x)
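# In the paper's design the learned model does not replace the search
# outright: it predicts an approximate position, and a search like
# bisect_search is still run within the model's min/max error window.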
@click.command()
@click.argument('mode', type=click.Choice(['ranged', 'hash', 'bloom']))
@click.option('--n', default=1000, type=int,
              help='Size of sorted array.')
@click.option('--lr', default=9e-3, type=float,
              help='Learning rate of DL model (only parameter that matters!)')
def main(mode, n, lr):
    """CLI for creating a machine-learned index."""
    N = n

    if mode == 'ranged':
        numbers, random_fun = sorted_numbers(N)
    elif mode == 'hash':
        raise NotImplementedError
    elif mode == 'bloom':
        raise NotImplementedError

    model = get_model()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    try:
        # train until interrupted; each step regresses positions on keys
        while True:
            batch_x, batch_y = [], []
            for _ in range(32):
                x, y = random_fun()
                batch_x.append(x)
                batch_y.append(y)
            batch_x = _featurize(batch_x)
            batch_y = _featurize(batch_y)

            # the network predicts a normalized position; scale by N to
            # recover an array index
            pred = model(batch_x) * N
            output = F.smooth_l1_loss(pred, batch_y)
            loss = output.item()
            print(loss)

            optimizer.zero_grad()
            output.backward()
            optimizer.step()
    except KeyboardInterrupt:
        pass
    def _test(x):
        """Print the true position of key x next to the model's guess."""
        with torch.no_grad():
            pred = model(_featurize([x])) * N
        # idx = naive_index_search(x, numbers)
        idx = bisect_search(x, numbers)
        print('Real:', idx, 'Predicted:', pred.item())

    _test(1.5)

    # drop into a debugger so more keys can be probed interactively
    import pdb
    pdb.set_trace()
if __name__ == '__main__':
    main()
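A natural follow-up inside main, next to _test, is a helper that measures how far predictions stray from the true positions. This is only a sketch (the name _evaluate and the sample size are illustrative), but it uses nothing beyond names the script already defines:

    def _evaluate(num_samples=100):
        """Report mean/max position error over randomly sampled keys."""
        errs = []
        for x in random.sample(numbers, num_samples):
            with torch.no_grad():
                pred = int(model(_featurize([x])).item() * N)
            errs.append(abs(pred - bisect_search(x, numbers)))
        print('mean error:', sum(errs) / len(errs), 'max error:', max(errs))

The max error is the number the paper cares about: it bounds the window a final local search must scan around the predicted position, so the learned index only wins when that bound stays small.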