Last active
August 29, 2015 14:18
-
-
Save senderle/ca722f4097cc4fa010f8 to your computer and use it in GitHub Desktop.
A test to verify that the browsing similarity formula I derived (http://www.lagado.name/blog/?p=126) works
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
import math
import collections
# This gist tests my derivation of browsing similarity.
# The browsing similarity determines the probability of moving
# from a node of type one to another node of type one in a
# fully-connected weighted bipartite network, given that the
# weights are interpreted as transition probabilities.
def browsing_similarity(t1, t2, texts):
    """Closed-form browsing similarity from topic t2 to topic t1.

    Each element of `texts` is a sequence of per-topic probabilities.
    Returns sum_k p_k(t1) * p_k(t2) / sum_k p_k(t2).
    """
    numerator = 0.0
    denominator = 0.0
    # Accumulate left-to-right over texts, matching the summation order
    # of the derived formula.
    for tx in texts:
        numerator += tx[t1] * tx[t2]
        denominator += tx[t2]
    return numerator / denominator
def brute_similarity(t1, t2, texts):
    """Brute-force browsing similarity: p(t1 | t2) marginalized over texts.

    Builds the full joint table p(i, j, k) over (topic_a, topic_b, text)
    and computes sum_k p(t1, t2, k) / sum_k sum_i p(i, t2, k).

    Derivation:
        p(i, j, k) = p(k) * p(j|k) * p(i|j, k) reduces to
        p(i, j, k) = p(k) * p(j|k) * p(i|k)
    because the two topic draws are independent given the text.
    """
    probs = {}
    for k, text in enumerate(texts):
        for i, topic_a in enumerate(text):
            for j, topic_b in enumerate(text):
                # p(k) is uniform over texts: 1 / len(texts).
                probs[(i, j, k)] = topic_a * topic_b * 1.0 / len(texts)
    numerator = 0.0
    for k, text in enumerate(texts):
        numerator += probs[(t1, t2, k)]
    denominator = 0.0
    for k, text in enumerate(texts):
        # BUG FIX: the marginalization over the first topic must iterate
        # the topics of this text (`text`), not the list of texts
        # (`texts`).  The original only worked because the demo happened
        # to use exactly as many texts as topics.
        for i, _topic in enumerate(text):
            denominator += probs[(i, t2, k)]
    return numerator / denominator
def categorical_sample(probs):
    """Draw an index from the categorical distribution `probs`.

    `probs` is a sequence of non-negative weights that should sum to
    (approximately) 1.0.  Returns the sampled index.
    """
    psum = 0
    x = random.random()
    for i, p in enumerate(probs):
        psum += p
        if x < psum:
            return i
    # BUG FIX: floating-point rounding can leave sum(probs) slightly
    # below 1.0; the original silently returned None when x landed in
    # that gap.  Fall back to the last category instead.
    return len(probs) - 1
def cs_est(t1, t2, texts, trials=1000):
    """Monte-Carlo estimate of the browsing similarity from t2 to t1.

    Repeatedly picks a text at random and samples a topic from it; each
    time that topic is t2, draws a second topic from the same text and
    counts hits on t1.  Returns hits / `trials` t2-draws.
    """
    hits = 0
    seen_t2 = 0
    while seen_t2 < trials:
        text = random.sample(texts, 1)[0]
        first = categorical_sample(text)
        if first != t2:
            continue
        seen_t2 += 1
        if categorical_sample(text) == t1:
            hits += 1
    return float(hits) / seen_t2
def display_results(t1, t2, texts): | |
est_d = sorted(cs_est(t1, t2, texts, int(1000)) for i in range(1000)) | |
est = sum(est_d) / len(est_d) | |
top5p = int(float(len(est_d)) / 20) | |
est_trm = sum(est_d[top5p:-top5p]) / (len(est_d) - 2 * top5p) | |
geom_est = math.exp(sum(math.log(e) for e in est_d if e > 0) / len(est_d)) | |
sigma = (sum((e - est) ** 2 for e in est_d) / (len(est_d) - 1)) ** 0.5 | |
median = sorted(est_d)[int(len(est_d) / 2)] | |
cs = browsing_similarity(t1, t2, texts) | |
print "calculated:", cs | |
print "mean, error, sigma:", est, est - cs, sigma | |
print "trimmed_mean, error:", est_trm, est_trm - cs | |
print "median, error:", median, median - cs | |
print "geometric mean, error:", geom_est, geom_est - cs | |
if __name__ == '__main__': | |
topics = ['cheese', 'bread', 'pineapple', 'kombucha', 'oatcakes', 'chips', 'crackers', 'gingerale', 'biscuits', 'milk'] | |
texts = [[abs(random.gauss(0, 1)) for _ in xrange(len(topics) - 1)] for _ in range(10)] | |
for t in texts: | |
t.append(8) | |
texts = [[p / sum(tx) for p in tx] for tx in texts] | |
t1 = random.randrange(10) | |
t2 = random.randrange(10) | |
cs_test = browsing_similarity(t1, t2, texts) | |
bs_test = brute_similarity(t1, t2, texts) | |
print "Brute force formula agrees with derived formula to within at least fifteen orders of magnitude:" | |
print abs(cs_test - bs_test) < (10 ** -15) | |
print "Stochasticlly estimating browsing similarity from topic 1 to topic 2 using 1000 trials of 1000 transitions each." | |
print "..." | |
display_results(t1, t2, texts) | |
print "Stochasticlly estimating browsing similarity from topic 2 to topic 1 using 1000 trials of 1000 transitions each." | |
print "..." | |
display_results(t2, t1, texts) | |
all_sims = [browsing_similarity(t, t2, texts) for t in range(10)] | |
print "The browsing similarity for all topcs from a given topic should sum to 1.0:" | |
print sum(all_sims) | |
print all_sims |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment