@Slater-Victoroff · Last active March 8, 2019
20 questions with word embeddings
import json
from collections import OrderedDict
import numpy as np
from scipy.spatial.distance import cdist
from indicoio.custom import vectorize
from nouns import NOUNS
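# Note: nouns.py isn't included in this gist; NOUNS just needs to be a flat list of noun
# strings (the actual list was ~1000 common nouns, per the twenty_questions docstring below).
# A minimal hypothetical stand-in would look like:
#     NOUNS = ["dog", "cat", "apple", "car", "mountain", "river", "chair", "guitar"]
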
# Map from noun -> feature vector, built by featurize_nouns() below.
try:
    FEATURES = json.load(open("features.json"), object_pairs_hook=OrderedDict)
except FileNotFoundError:
    # features.json doesn't exist yet; run featurize_nouns() first to build it.
    FEATURES = OrderedDict()

def featurize_nouns(outfile="features.json"):
    """
    Turn the noun list into text features.

    Creates a json file that maps each word to the feature vector for that word.
    """
    # Increase batch_size to speed this up; reduce it if you hit Gateway Timeout errors.
    features = vectorize(NOUNS, batch_size=200)
    result = {word: feature for word, feature in zip(NOUNS, features)}
    json.dump(result, open(outfile, 'w'))

def _get_top_n_words_from_indices(scores, top_n, reverse=True):
    """Turn a list of scores into a list of words."""
    if reverse:
        return [list(FEATURES.keys())[index] for index in np.argsort(scores)[0][::-1][:top_n]]
    else:
        return [list(FEATURES.keys())[index] for index in np.argsort(scores)[0][:top_n]]

def ask_question(options):
    """
    Pose a question to the user.

    options should be the list of answer choices.
    """
    option_text = "".join(["\t%s: %s\n" % pair for pair in enumerate(options)])  # Format options for prompting
    response = input("Which is closest to your word?\n" + option_text)  # Prompt for input
    answer_features = vectorize(options[int(response)])  # Vectorize selected response
    # Compute distances between the selected answer and all featurized nouns
    distances = cdist([answer_features], list(FEATURES.values()), metric="cosine")
    # Squeeze the 0 - 1 distances into a 0.5 - 1 space
    distances /= 2
    distances += 0.5
    return distances

def closest_difference(*args, top_n):  # Keyword-only argument after *args means this is Python 3 only
    """
    Find question options likely to help us choose between the most likely answers.

    Input is an arbitrary-length list of terms to consider. Complexity is ~O(n^2), and generally
    this approach gets fuzzier and less effective with too many considered terms.
    """
    deltas = []
    features = vectorize(list(args))  # Vectorize all args
    for i, outer_arg in enumerate(args):
        for j, inner_arg in enumerate(args):
            if j > i:  # Make sure we only compute novel combinations of terms
                # For each combination of terms, append the difference of those terms to deltas
                deltas.append(np.subtract(features[i], features[j]))
    # The goal is to pick terms that are close to the entries in the deltas array.
    # The idea is that embeddings close to A - B will be most helpful in
    # learning to differentiate between A and B.
    distances = cdist(deltas, list(FEATURES.values()), metric="cosine")  # Compute distances
    # Reduce to optimize for overall distance to all entries in the deltas array.
    # Multiplication biases selection toward candidates with individual distances close to zero.
    distances = np.multiply.reduce(distances, axis=0).reshape(-1, 1)
    # Once we've found the best candidates, map them back to words so we can actually ask the question
    return [list(FEATURES.keys())[i[0]] for i in np.argsort(distances, axis=0)[:top_n]]

def twenty_questions(base_questions, print_n=5, questions=5, answer_ply=10):
    """
    Start prompting the user for input and try to determine the word they're thinking of.

    base_questions is a series of lists laying out some initialization questions to help the model along.
    Good base_questions are high-level features like object color, size, texture, etc.

    print_n is how many of the top-confidence results should be printed at the end of the routine.
    If things work perfectly, a print_n of 1 should be perfectly acceptable.

    questions is the number of questions to ask the user. It makes sense to increase this as the number
    of options increases; 5 was plenty for the standard list of 1000 terms.

    answer_ply is the number of answers to consider when creating the next question. Generally, increasing
    this number will lead to better performance. In earlier iterations it may also make sense to increase
    this number, since confidence that one of the top n guesses is actually correct increases over time.
    """
    scores = np.ones((len(FEATURES), 1))
    for question in base_questions:
        # For each question, divide the running score list by the distances returned by ask_question.
        # No need to normalize these values since we only ever use them for ranking,
        # which is why a straight divide is acceptable.
        scores = np.divide(scores, ask_question(question))
    current_top = _get_top_n_words_from_indices(scores, answer_ply)
    for _ in range(questions):  # Run the same process iteratively as confidences are updated
        distances = ask_question(closest_difference(*current_top, top_n=5))
        scores = np.divide(scores, distances)
        # Pick the next candidates from the latest question's distances (closest first)
        current_top = _get_top_n_words_from_indices(distances, answer_ply, False)
    # Print the most confident results once all questions have been asked.
    print(_get_top_n_words_from_indices(scores, print_n))

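# To make the ranking update concrete, a tiny hypothetical illustration (numbers made up,
# not part of the original gist): dividing a running score row by the 0.5 - 1 squashed
# distances boosts the nouns closest to the chosen answer.
#
#     scores = np.ones((1, 3))                  # three candidate nouns, equal starting scores
#     distances = np.array([[0.55, 0.9, 0.7]])  # squashed distances from one answered question
#     scores / distances                        # -> [[1.82, 1.11, 1.43]]; smallest distance ranks first
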
if __name__ == "__main__":
    # Uncomment featurize_nouns() for the first run to build features.json,
    # then uncomment twenty_questions(...) to play.
    # featurize_nouns()
    # twenty_questions([
    #     ["animal", "mineral", "vegetable"],
    #     ["red", "blue", "green", "black", "white"],
    #     ["big", "medium", "small"],
    #     ["smooth", "rough"],
    # ])
    pass