Skip to content

Instantly share code, notes, and snippets.

@senderle
Last active August 29, 2015 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save senderle/ca722f4097cc4fa010f8 to your computer and use it in GitHub Desktop.
Save senderle/ca722f4097cc4fa010f8 to your computer and use it in GitHub Desktop.
A test verifying that the browsing-similarity formula derived at http://www.lagado.name/blog/?p=126 behaves as expected.
from __future__ import print_function

import collections
import math
import random
# This gist tests my derivation of browsing similarity.
# The browsing similarity determines the probability of moving
# from a node of type one to another node of type one in a
# fully-connected weighted bipartite network, given that the
# weights are interpreted as transition probabilities.
def browsing_similarity(t1, t2, texts):
    """Closed-form browsing similarity from topic ``t2`` to topic ``t1``.

    Each entry of ``texts`` is a sequence of topic proportions, read as
    transition probabilities in a fully-connected bipartite text/topic
    network.  Returns the probability of stepping to ``t1`` given that
    the walk arrived at the current text via ``t2``.
    """
    numerator = 0.0
    denominator = 0.0
    for text in texts:
        numerator += text[t1] * text[t2]
        denominator += text[t2]
    return numerator / denominator
def brute_similarity(t1, t2, texts):
    """Brute-force browsing similarity via the full joint distribution.

    Builds p(i, j, k) for every ordered topic pair (i, j) in every text k,
    assuming texts are chosen uniformly at random, then computes
    p(t1 | t2) by marginalizing over texts.  Serves as an independent
    check on ``browsing_similarity``.
    """
    # p(i, j, k) = p(k) * p(j|k) * p(i|j, k) reduces to
    # p(i, j, k) = p(k) * p(j|k) * p(i|k)
    n_texts = len(texts)
    probs = {}
    for k, text in enumerate(texts):
        for i, topic_a in enumerate(text):
            for j, topic_b in enumerate(text):
                probs[(i, j, k)] = topic_a * topic_b * 1.0 / n_texts

    # Numerator: p(t1, t2), marginalized over texts.
    numerator = 0.0
    for k in range(n_texts):
        numerator += probs[(t1, t2, k)]

    # Denominator: p(t2), marginalized over texts and first topics.
    # BUG FIX: the inner loop must range over the *topics* of each text,
    # not over the list of texts; the original ``enumerate(texts)`` only
    # worked because the demo uses equal numbers of texts and topics (10).
    denominator = 0.0
    for k, text in enumerate(texts):
        for i in range(len(text)):
            denominator += probs[(i, t2, k)]
    return numerator / denominator
def categorical_sample(probs):
    """Draw an index from the categorical distribution given by ``probs``.

    ``probs`` is a sequence of probabilities that should sum to 1.
    Returns the sampled index.  If floating-point rounding leaves the
    cumulative sum slightly below the drawn value, the last index is
    returned instead of falling off the end of the loop (the original
    implicitly returned ``None`` in that case).
    """
    x = random.random()
    cumulative = 0.0
    for i, p in enumerate(probs):
        cumulative += p
        if x < cumulative:
            return i
    # Rounding fallback: x landed at or past the accumulated total.
    return len(probs) - 1
def cs_est(t1, t2, texts, trials=1000):
    """Monte-Carlo estimate of the browsing similarity from ``t2`` to ``t1``.

    Repeatedly picks a random text and samples a topic from it; whenever
    the sampled topic is ``t2``, draws a second topic from the same text
    and counts how often it is ``t1``.  Stops after ``trials`` arrivals
    at ``t2`` and returns the observed ratio.
    """
    hits = 0
    arrivals = 0
    while arrivals < trials:
        text = random.sample(texts, 1)[0]
        if categorical_sample(text) != t2:
            continue
        arrivals += 1
        if categorical_sample(text) == t1:
            hits += 1
    return float(hits) / arrivals
def display_results(t1, t2, texts):
    """Compare the closed-form similarity with stochastic estimates.

    Runs ``cs_est`` 1000 times and prints the calculated value alongside
    several summary statistics of the estimates (mean, trimmed mean,
    median, geometric mean) together with their errors.

    Uses ``print`` as a function; requires ``from __future__ import
    print_function`` under Python 2.
    """
    est_d = sorted(cs_est(t1, t2, texts, 1000) for _ in range(1000))
    est = sum(est_d) / len(est_d)
    # Trim the top and bottom 5% of estimates for the trimmed mean.
    top5p = int(float(len(est_d)) / 20)
    est_trm = sum(est_d[top5p:-top5p]) / (len(est_d) - 2 * top5p)
    # Geometric mean over the positive estimates only (log(0) is undefined).
    # NOTE(review): the divisor is the full sample size, not the count of
    # positive estimates — preserved as-is from the original.
    geom_est = math.exp(sum(math.log(e) for e in est_d if e > 0) / len(est_d))
    # Sample standard deviation (Bessel-corrected).
    sigma = (sum((e - est) ** 2 for e in est_d) / (len(est_d) - 1)) ** 0.5
    # est_d is already sorted; the original re-sorted it redundantly here.
    median = est_d[len(est_d) // 2]
    cs = browsing_similarity(t1, t2, texts)
    print("calculated:", cs)
    print("mean, error, sigma:", est, est - cs, sigma)
    print("trimmed_mean, error:", est_trm, est_trm - cs)
    print("median, error:", median, median - cs)
    print("geometric mean, error:", geom_est, geom_est - cs)
if __name__ == '__main__':
    # Demo: build 10 random texts over 10 topics and check that the
    # derived formula, the brute-force formula, and stochastic estimation
    # all agree.  (Print-function calls require the __future__ import
    # under Python 2; xrange was replaced by range for Python 3.)
    topics = ['cheese', 'bread', 'pineapple', 'kombucha', 'oatcakes',
              'chips', 'crackers', 'gingerale', 'biscuits', 'milk']
    # Random non-negative topic weights; the last topic gets a large fixed
    # weight so the distributions are clearly non-uniform.
    texts = [[abs(random.gauss(0, 1)) for _ in range(len(topics) - 1)]
             for _ in range(10)]
    for t in texts:
        t.append(8)
    # Normalize each text's weights into a probability distribution.
    texts = [[p / sum(tx) for p in tx] for tx in texts]
    t1 = random.randrange(10)
    t2 = random.randrange(10)
    cs_test = browsing_similarity(t1, t2, texts)
    bs_test = brute_similarity(t1, t2, texts)
    print("Brute force formula agrees with derived formula to within at least fifteen orders of magnitude:")
    print(abs(cs_test - bs_test) < (10 ** -15))
    print()
    # Typo fixes in the user-facing messages: "Stochasticlly" -> "Stochastically".
    print("Stochastically estimating browsing similarity from topic 1 to topic 2 using 1000 trials of 1000 transitions each.")
    print("...")
    print()
    display_results(t1, t2, texts)
    print()
    print("Stochastically estimating browsing similarity from topic 2 to topic 1 using 1000 trials of 1000 transitions each.")
    print("...")
    print()
    display_results(t2, t1, texts)
    print()
    all_sims = [browsing_similarity(t, t2, texts) for t in range(10)]
    # Typo fix: "topcs" -> "topics".
    print("The browsing similarity for all topics from a given topic should sum to 1.0:")
    print(sum(all_sims))
    print(all_sims)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment