@catherio
Created October 27, 2010 22:31
from csc import divisi2
# Build conceptnet
cnet_graph = divisi2.load('data:graphs/conceptnet_en.graph')
A = divisi2.network.sparse_matrix(cnet_graph, 'nodes', 'features', cutoff=5)
# Build analogyspace
U, S, V = A.normalize_all().svd()
predictions = divisi2.reconstruct(U,S,V) # lazy
#A_approx = divisi2.dot(U*S, V.T) # the 'real deal', very slow if A is big
# Do another SVD on U to find the weighted concept-vector (ie, the feature)
# of the most variance.
# ("efspace" is "eigenfeaturespace")
# (to_sparse() is so that it uses cleverer SVD math)
(efspaceU, efspaceS, efspaceV) = U.normalize_all().to_sparse().svd(1)
most_variance_feature = efspaceU.T[0]
# Dot product this eigenfeature, through V*S*U^T, with all other features,
# to find how much each natural-language-feature "matches" the eigenfeature
closest_features = V.dot(U.T.dot(most_variance_feature).multiply(S))
# The first question we will ask is the natural-language-feature which
# is closest to the direction of most variance.
first_question = closest_features.top_items(1)[0][0]
# GAME PLAN:
#
# At any stage of the game, we have a matrix of concepts-to-features,
# with the concept vectors weighted by how much they've been matching
# at this stage of the game. I call these "w-concepts" for "weighted concepts"
# ACTUAL DETAILS ARE FROM DIVISI, NOT DIVISI2!
# 1. Do an SVD of analogyspace over features to find the first eigenfeature
# - normalized or not?
# -> In Divisi2, normalize_all method
# - how can I get at the v's?
# -> .v, then SVD the way you would conceptnet
# -> v, not weighted_v, because if weighted with sigma then output re-SVD'd would not necessarily be the same (THIS IS NOT ACTUALLY RELEVANT)
# - how can I see the axes, in order, of an SVD2DResults?
# -> .summarize() will print them, or .summarize_axis(0) for the first axis
# -> Try efspace.u[:, 0].top_items(1)[0][0]
#    -> For opposites, do efspace.u[:,0].top_items(1, largest=False)[0][0]
# IDEA: flip between top and bottom items, and vary the first "0" to 1, 2, to grab nearby possibilities.
#  - NEEDS ANSWER: how can I get the thing pointing most in the right DIRECTION? Isn't that the one that's most like the eigenfeature? Do I want weighted? Weighted seems much more dramatically different
# -> strong, off-angle one has more information about it
# -> small, on-angle one is less known
# -> try both, but probably the first one
# - why are its dimensions 12976 by 121488, not 100 by something? still don't quite understand
# 2. Ask about the first eigenfeature
# -> the commands above give questions like ('right', u'HasProperty', u'bad'), which are already in near-perfect format
# 3. Given a yes/no answer about the eigenfeature asked, choose a weight
# function which takes a magnitude and returns either a sigmoid or a
# backwards sigmoid of that magnitude. (T/F -> function)
# - need SciPy to get the erf function, methinks
# Idea: "probably" and "probably not" can trigger wider sigmoid functions
# For each concept:
# 4. Calculate the projection of the w-concept along the feature asked.
# -> How to turn question into a vector:
# vectorq = s.analogyspace_norm.weighted_v[question,:]
# -> How to calculate projection:
# 5. Subtract out the projection from the w-concept.
# -> Ken is making this be clever for large, dense matrices
# -> For now, use a smaller segment of conceptnet
# 6. Calculate the weight function for the magnitude of the projection, and
# weight w-concept vector by the result.
# Repeat!
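One way to picture step 3 (and the "probably" idea) is a weight function built from `erf`, which is in Python's standard `math` module, so SciPy isn't strictly needed. This is only a sketch under assumptions of mine, not the gist's actual code: `make_weight_fn`, the `width` parameter, and the exact sigmoid shape are all made up for illustration.

```python
import math

def make_weight_fn(answer_is_yes, width=1.0):
    """Hypothetical sketch of step 3: map a yes/no answer to a weight
    function over projection magnitudes. A "yes" up-weights concepts whose
    projection along the asked feature is large and positive; a "no" gives
    the "backwards sigmoid". erf supplies the sigmoid shape; a larger
    `width` could widen it for "probably"/"probably not" answers."""
    def weight(magnitude):
        s = 0.5 * (1.0 + math.erf(magnitude / width))  # sigmoid in [0, 1]
        return s if answer_is_yes else 1.0 - s
    return weight

yes_w = make_weight_fn(True)
no_w = make_weight_fn(False)
# yes_w(0.0) is 0.5 (no evidence either way); yes_w grows toward 1.0
# for large positive magnitudes, while no_w shrinks toward 0.0.
```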
@catherio (Author)
Yeah, that line near the end of the comment block that says "-> How to calculate projection:" should really probably say "-> How to calculate projection: ????" =P
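For what it's worth, the missing piece in steps 4–5 is just the standard vector projection. A minimal numpy sketch, with made-up stand-in vectors rather than real divisi2 objects:

```python
import numpy as np

# Hypothetical stand-ins for a w-concept vector and the asked feature's vector.
w_concept = np.array([3.0, 1.0, 2.0])
feature = np.array([1.0, 0.0, 0.0])

# Step 4: projection of the w-concept along the (unit-normalized) feature.
unit = feature / np.linalg.norm(feature)
magnitude = w_concept.dot(unit)   # signed length along the feature
projection = magnitude * unit     # component parallel to the feature

# Step 5: subtract the projection, leaving the orthogonal remainder.
remainder = w_concept - projection
assert abs(remainder.dot(unit)) < 1e-12  # nothing left along the feature
```

The `magnitude` here is what the weight function in step 6 would be applied to.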
