Last active
August 29, 2015 14:18
-
-
Save senderle/ca722f4097cc4fa010f8 to your computer and use it in GitHub Desktop.
A test to verify that the browsing similarity formula I derived (http://www.lagado.name/blog/?p=126) works
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
import math
import collections
# This gist tests my derivation of browsing similarity.
# The browsing similarity determines the probability of moving
# from a node of type one to another node of type one in a
# fully-connected weighted bipartite network, given that the
# weights are interpreted as transition probabilities.
def browsing_similarity(t1, t2, texts):
    """Closed-form browsing similarity from topic t2 to topic t1.

    Each element of `texts` is a sequence of per-topic probabilities.
    Returns sum_k p_k(t1) * p_k(t2) / sum_k p_k(t2).
    """
    numerator = 0.0
    denominator = 0.0
    # Accumulate left-to-right over texts, matching the summation order
    # of the derived formula.
    for tx in texts:
        numerator += tx[t1] * tx[t2]
        denominator += tx[t2]
    return numerator / denominator
def brute_similarity(t1, t2, texts):
    """Brute-force browsing similarity: p(t1 | t2) marginalized over texts.

    Builds the full joint table p(i, j, k) over (topic_a, topic_b, text)
    and computes sum_k p(t1, t2, k) / sum_k sum_i p(i, t2, k).

    Derivation:
        p(i, j, k) = p(k) * p(j|k) * p(i|j, k) reduces to
        p(i, j, k) = p(k) * p(j|k) * p(i|k)
    because the two topic draws are independent given the text.
    """
    probs = {}
    for k, text in enumerate(texts):
        for i, topic_a in enumerate(text):
            for j, topic_b in enumerate(text):
                # p(k) is uniform over texts: 1 / len(texts).
                probs[(i, j, k)] = topic_a * topic_b * 1.0 / len(texts)
    numerator = 0.0
    for k, text in enumerate(texts):
        numerator += probs[(t1, t2, k)]
    denominator = 0.0
    for k, text in enumerate(texts):
        # BUG FIX: the marginalization over the first topic must iterate
        # the topics of this text (`text`), not the list of texts
        # (`texts`).  The original only worked because the demo happened
        # to use exactly as many texts as topics.
        for i, _topic in enumerate(text):
            denominator += probs[(i, t2, k)]
    return numerator / denominator
def categorical_sample(probs):
    """Draw an index from the categorical distribution `probs`.

    `probs` is a sequence of non-negative weights that should sum to
    (approximately) 1.0.  Returns the sampled index.
    """
    psum = 0
    x = random.random()
    for i, p in enumerate(probs):
        psum += p
        if x < psum:
            return i
    # BUG FIX: floating-point rounding can leave sum(probs) slightly
    # below 1.0; the original silently returned None when x landed in
    # that gap.  Fall back to the last category instead.
    return len(probs) - 1
def cs_est(t1, t2, texts, trials=1000):
    """Monte-Carlo estimate of the browsing similarity from t2 to t1.

    Repeatedly picks a text at random and samples a topic from it; each
    time that topic is t2, draws a second topic from the same text and
    counts hits on t1.  Returns hits / `trials` t2-draws.
    """
    hits = 0
    seen_t2 = 0
    while seen_t2 < trials:
        text = random.sample(texts, 1)[0]
        first = categorical_sample(text)
        if first != t2:
            continue
        seen_t2 += 1
        if categorical_sample(text) == t1:
            hits += 1
    return float(hits) / seen_t2
def display_results(t1, t2, texts): | |
est_d = sorted(cs_est(t1, t2, texts, int(1000)) for i in range(1000)) | |
est = sum(est_d) / len(est_d) | |
top5p = int(float(len(est_d)) / 20) | |
est_trm = sum(est_d[top5p:-top5p]) / (len(est_d) - 2 * top5p) | |
geom_est = math.exp(sum(math.log(e) for e in est_d if e > 0) / len(est_d)) | |
sigma = (sum((e - est) ** 2 for e in est_d) / (len(est_d) - 1)) ** 0.5 | |
median = sorted(est_d)[int(len(est_d) / 2)] | |
cs = browsing_similarity(t1, t2, texts) | |
print "calculated:", cs | |
print "mean, error, sigma:", est, est - cs, sigma | |
print "trimmed_mean, error:", est_trm, est_trm - cs | |
print "median, error:", median, median - cs | |
print "geometric mean, error:", geom_est, geom_est - cs | |
if __name__ == '__main__': | |
topics = ['cheese', 'bread', 'pineapple', 'kombucha', 'oatcakes', 'chips', 'crackers', 'gingerale', 'biscuits', 'milk'] | |
texts = [[abs(random.gauss(0, 1)) for _ in xrange(len(topics) - 1)] for _ in range(10)] | |
for t in texts: | |
t.append(8) | |
texts = [[p / sum(tx) for p in tx] for tx in texts] | |
t1 = random.randrange(10) | |
t2 = random.randrange(10) | |
cs_test = browsing_similarity(t1, t2, texts) | |
bs_test = brute_similarity(t1, t2, texts) | |
print "Brute force formula agrees with derived formula to within at least fifteen orders of magnitude:" | |
print abs(cs_test - bs_test) < (10 ** -15) | |
print "Stochasticlly estimating browsing similarity from topic 1 to topic 2 using 1000 trials of 1000 transitions each." | |
print "..." | |
display_results(t1, t2, texts) | |
print "Stochasticlly estimating browsing similarity from topic 2 to topic 1 using 1000 trials of 1000 transitions each." | |
print "..." | |
display_results(t2, t1, texts) | |
all_sims = [browsing_similarity(t, t2, texts) for t in range(10)] | |
print "The browsing similarity for all topcs from a given topic should sum to 1.0:" | |
print sum(all_sims) | |
print all_sims |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment