@Slater-Victoroff · Last active March 8, 2019
20 questions with word embeddings
import json
from collections import OrderedDict
import numpy as np
from scipy.spatial.distance import cdist
from indicoio.custom import vectorize
from nouns import NOUNS
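# Note: nouns.py isn't included in this gist; NOUNS just needs to be a flat list of noun
# strings (the actual list was ~1000 common nouns, per the twenty_questions docstring below).
# A minimal hypothetical stand-in would look like:
#     NOUNS = ["dog", "cat", "apple", "car", "mountain", "river", "chair", "guitar"]
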
# Map from noun -> feature vector, built by featurize_nouns() below.
try:
    FEATURES = json.load(open("features.json"), object_pairs_hook=OrderedDict)
except FileNotFoundError:
    # features.json doesn't exist yet; run featurize_nouns() first to build it.
    FEATURES = OrderedDict()

def featurize_nouns(outfile="features.json"):
    """
    Turn the noun list into text features.

    Creates a json file that maps each word to the feature vector for that word.
    """
    # Increase batch_size to speed this up; reduce it if you hit Gateway Timeout errors.
    features = vectorize(NOUNS, batch_size=200)
    result = {word: feature for word, feature in zip(NOUNS, features)}
    json.dump(result, open(outfile, 'w'))

def _get_top_n_words_from_indices(scores, top_n, reverse=True):
    """Turn a list of scores into a list of words."""
    if reverse:
        return [list(FEATURES.keys())[index] for index in np.argsort(scores)[0][::-1][:top_n]]
    else:
        return [list(FEATURES.keys())[index] for index in np.argsort(scores)[0][:top_n]]

def ask_question(options):
    """
    Pose a question to the user.

    options should be the list of answer choices.
    """
    option_text = "".join(["\t%s: %s\n" % pair for pair in enumerate(options)])  # Format options for prompting
    response = input("Which is closest to your word?\n" + option_text)  # Prompt for input
    answer_features = vectorize(options[int(response)])  # Vectorize selected response
    # Compute distances between the selected answer and all featurized nouns
    distances = cdist([answer_features], list(FEATURES.values()), metric="cosine")
    # Squeeze the 0 - 1 distances into a 0.5 - 1 space
    distances /= 2
    distances += 0.5
    return distances

def closest_difference(*args, top_n):  # Keyword-only argument after *args means this is Python 3 only
    """
    Find question options likely to help us choose between the most likely answers.

    Input is an arbitrary-length list of terms to consider. Complexity is ~O(n^2), and generally
    this approach gets fuzzier and less effective with too many considered terms.
    """
    deltas = []
    features = vectorize(list(args))  # Vectorize all args
    for i, outer_arg in enumerate(args):
        for j, inner_arg in enumerate(args):
            if j > i:  # Make sure we only compute novel combinations of terms
                # For each combination of terms, append the difference of those terms to deltas
                deltas.append(np.subtract(features[i], features[j]))
    # The goal is to pick terms that are close to the entries in the deltas array.
    # The idea is that embeddings close to A - B will be most helpful in
    # learning to differentiate between A and B.
    distances = cdist(deltas, list(FEATURES.values()), metric="cosine")  # Compute distances
    # Reduce to optimize for overall distance to all entries in the deltas array.
    # Multiplication biases selection toward candidates with individual distances close to zero.
    distances = np.multiply.reduce(distances, axis=0).reshape(-1, 1)
    # Once we've found the best candidates, map them back to words so we can actually ask the question
    return [list(FEATURES.keys())[i[0]] for i in np.argsort(distances, axis=0)[:top_n]]

def twenty_questions(base_questions, print_n=5, questions=5, answer_ply=10):
    """
    Start prompting the user for input and try to determine the word they're thinking of.

    base_questions is a series of lists laying out some initialization questions to help the model along.
    Good base_questions are high-level features like object color, size, texture, etc.

    print_n is how many of the top-confidence results should be printed at the end of the routine.
    If things work perfectly, a print_n of 1 should be perfectly acceptable.

    questions is the number of questions to ask the user. It makes sense to increase this as the number
    of options increases; 5 was plenty for the standard list of 1000 terms.

    answer_ply is the number of answers to consider when creating the next question. Generally, increasing
    this number will lead to better performance. In earlier iterations it may also make sense to increase
    this number, since confidence that one of the top n guesses is actually correct increases over time.
    """
    scores = np.ones((len(FEATURES), 1))
    for question in base_questions:
        # For each question, divide the running score list by the distances returned by ask_question.
        # No need to normalize these values since we only ever use them for ranking,
        # which is why a straight divide is acceptable.
        scores = np.divide(scores, ask_question(question))
    current_top = _get_top_n_words_from_indices(scores, answer_ply)
    for _ in range(questions):  # Run the same process iteratively as confidences are updated
        distances = ask_question(closest_difference(*current_top, top_n=5))
        scores = np.divide(scores, distances)
        # Pick the next candidates from the latest question's distances (closest first)
        current_top = _get_top_n_words_from_indices(distances, answer_ply, False)
    # Print the most confident results once all questions have been asked.
    print(_get_top_n_words_from_indices(scores, print_n))

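# To make the ranking update concrete, a tiny hypothetical illustration (numbers made up,
# not part of the original gist): dividing a running score row by the 0.5 - 1 squashed
# distances boosts the nouns closest to the chosen answer.
#
#     scores = np.ones((1, 3))                  # three candidate nouns, equal starting scores
#     distances = np.array([[0.55, 0.9, 0.7]])  # squashed distances from one answered question
#     scores / distances                        # -> [[1.82, 1.11, 1.43]]; smallest distance ranks first
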
if __name__ == "__main__":
    # Uncomment featurize_nouns() for the first run to build features.json,
    # then uncomment twenty_questions(...) to play.
    # featurize_nouns()
    # twenty_questions([
    #     ["animal", "mineral", "vegetable"],
    #     ["red", "blue", "green", "black", "white"],
    #     ["big", "medium", "small"],
    #     ["smooth", "rough"],
    # ])
    pass