Attention Potential Validation Code

See the project website for more details.

Please report any issues to emaadahmed.manzoor@kaust.edu.sa.

Correlation Results

The attention potential (as estimated in section 4), when evaluated on this Twitter dataset (a short check of these figures follows the list):

  • Has a Pearson correlation of 73.61% with the number of retweets obtained.
  • Is significantly correlated with retweets (p < 0.05; R reports p < 2.2e-16).
  • Has a 95% confidence interval of [72.72%, 74.48%] on the correlation estimate.
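These figures are mutually consistent: the confidence interval is the Fisher z-transform interval around the sample correlation. A minimal sketch of the check, assuming r = 0.7361445 and n = 10480 (df + 2, from the R output below):

import math

r, n = 0.7361445, 10480            # sample correlation; n = df + 2 from cor.test
z = math.atanh(r)                  # Fisher z-transform of r
se = 1.0 / math.sqrt(n - 3)        # standard error of z
q = 1.959964                       # 97.5% normal quantile, as used by cor.test
lo, hi = math.tanh(z - q * se), math.tanh(z + q * se)
print "95%% CI: [%.7f, %.7f]" % (lo, hi)   # ~[0.7272485, 0.7447932]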

Execution

Running this requires having the following files in the same directory as validate_ap.py (their expected record layout is sketched after the list):

  • all_links.p
  • all_tweets.p
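The pickle schemas are not documented here; the field accesses in validate_ap.py suggest that each tweet is a tuple (user_id, timestamp_ms, retweeted_user_id, replied_to_user_id) and each link is a directed (source_id, target_id) pair, but these names are inferred, not authoritative. A quick way to inspect your copy of the data:

import cPickle as pickle

# field meanings inferred from validate_ap.py, not from a published schema:
#   tweet: (user_id, timestamp_ms, retweeted_user_id, replied_to_user_id)
#   link:  (source_id, target_id)
tweets = pickle.load(open('all_tweets.p', 'rb'))
links = pickle.load(open('all_links.p', 'rb'))
print tweets[0]
print links[0]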

It is run with a simple Python call: python validate_ap.py

The output is tab-separated, with the columns as follows:

producer_id | follower_id | number_of_reactions | attention_potential | gamma | delta | rho

The output can then be processed to evaluate the correlation between the number of reactions and attention potential. The following code does this in R:

mydata <- read.csv('output.txt', sep='\t', header=FALSE)
names(mydata) <- c('pid', 'fid', 'nreactions', 'ap', 'gamma', 'delta', 'rho')
nonzero <- subset(mydata, nreactions > 0 | ap > 0)
cor.test(nonzero$nreactions, nonzero$ap)

Output:

	Pearson's product-moment correlation

data:  nonzero$nreactions and nonzero$ap
t = 111.3336, df = 10478, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.7272485 0.7447932
sample estimates:
      cor 
0.7361445 
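For reference, a rough Python equivalent of the R snippet above, assuming NumPy and SciPy are installed (scipy.stats.pearsonr returns r and the p-value, but not the confidence interval):

import numpy as np
from scipy.stats import pearsonr

data = np.loadtxt('output.txt', delimiter='\t')
nreactions, ap = data[:, 2], data[:, 3]
keep = (nreactions > 0) | (ap > 0)       # same filter as the R subset() call
r, p = pearsonr(nreactions[keep], ap[keep])
print "r = %.7f, p = %g" % (r, p)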
validate_ap.py:

#!/usr/bin/env python
import cPickle as pickle
from itertools import groupby
from multiprocessing import Pool

LOGIN_THRESHOLD = 8  # hours

def calculate_ap(producer, producer_tweets, follower, tweets, links):
    # all the follower's tweets
    follower_tweets = [tweet for tweet in tweets
                       if tweet[0] == follower]
    if len(follower_tweets) == 0:
        return 0, 0.0, 0, 0, 0

    # all the competitors' tweets
    competitors = set([link[1] for link in links if link[0] == follower])
    competitor_tweets = [tweet for tweet in tweets
                         if tweet[0] in competitors]

    # estimate gamma: prior prob. of a follower reaction
    # this is a user-defined weight
    follower_reactions = [tweet for tweet in follower_tweets
                          if tweet[2] > 0 or tweet[3] > 0]
    gamma = len(follower_reactions) / float(len(follower_tweets))

    # estimate delta: prob. of the producer's tweet getting a follower reaction
    # this is the usual measure of (directed) tie strength
    follower_reactions_to_producer = [tweet for tweet in follower_reactions
                                      if tweet[2] == producer or tweet[3] == producer]
    delta = len(follower_reactions_to_producer) / float(len(producer_tweets))
    if gamma == 0 or delta == 0:
        return len(follower_reactions_to_producer), 0.0, 0, 0, 0

    # estimate rho: prob. of leaving the timeline after reading a post
    #
    # rho is the parameter of a geometric distribution,
    # estimated from the average number of posts consumed per login
    #
    # a tweet is "consumed" if it is reacted to
    # a "login" is counted for a follower's tweet
    # occurring after a gap of 8 hours (§2.2)
    num_logins = 1  # the first tweet counts as a login; also avoids division by zero
    prev = follower_tweets[0][1] / 1000
    for follower_tweet in follower_tweets[1:]:
        diff = (follower_tweet[1] / 1000 - prev) / 3600.  # hours
        if diff >= LOGIN_THRESHOLD:
            num_logins += 1
        prev = follower_tweet[1] / 1000
    mu = len(follower_reactions) / float(num_logins)
    rho = 1. / (1 + mu)

    # reconstruct the timeline: (timestamp in seconds, post type), newest first;
    # post types: 2 = follower, 1 = competitor, 0 = producer
    timeline = []
    timeline += [(t[1] / 1000, 2) for t in follower_tweets]
    timeline += [(t[1] / 1000, 1) for t in competitor_tweets]
    timeline += [(t[1] / 1000, 0) for t in producer_tweets]
    timeline = sorted(timeline, reverse=True)

    # calculate ap: geometric user/cluster survival fns
    ap = 0.0
    depth = 0
    cluster_size = 0
    cluster_ap = 0.0
    for post in timeline:
        if post[1] == 2:  # follower post: a login; reading restarts from the top
            depth = 0
            continue
        depth += 1
        if post[1] == 1:  # competitor post: close the current producer cluster
            ap += (delta ** cluster_size) * cluster_ap
            cluster_size = 0
            cluster_ap = 0.0
            continue
        if post[1] == 0:  # producer post: grow the current cluster
            cluster_size += 1
            cluster_ap += (1 - rho) ** depth
            continue

    ap *= gamma  # toggle this line to evaluate with and without gamma
    return len(follower_reactions_to_producer), ap, gamma, delta, rho

def main():
    tweets = sorted(pickle.load(open('all_tweets.p', 'rb')))
    links = sorted(pickle.load(open('all_links.p', 'rb')))

    # multiprocessing
    pool = Pool(8)
    results = []  # (producer, follower, async result)
    for producer, producer_links in groupby(links, key=lambda x: x[0]):
        producer_tweets = [tweet for tweet in tweets
                           if tweet[0] == producer]
        if len(producer_tweets) == 0:
            continue

        # calculate the attention potential of the producer for each follower
        for producer_link in producer_links:
            follower = producer_link[1]
            results.append((producer, follower,
                            pool.apply_async(calculate_ap,
                                             (producer, producer_tweets,
                                              follower, tweets, links))))

    for producer, follower, r in results:
        producer_rts, producer_ap, gamma, delta, rho = r.get()
        print "%s\t%s\t%d\t%.4f\t%.4f\t%.4f\t%.4f" % (
            producer, follower, producer_rts, producer_ap, gamma, delta, rho)

if __name__ == "__main__":
    main()
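As a quick sanity check, calculate_ap can be exercised on a hypothetical toy dataset. The IDs, timestamps, and tuple layout below are illustrative assumptions following the field accesses above, and the script is assumed to be importable as validate_ap:

from validate_ap import calculate_ap

HOUR_MS = 3600 * 1000
producer_tweets = [(1, 0 * HOUR_MS, 0, 0), (1, 5 * HOUR_MS, 0, 0)]
tweets = sorted(producer_tweets + [
    (2, 1 * HOUR_MS, 1, 0),   # the follower (user 2) retweets the producer (user 1)
    (2, 20 * HOUR_MS, 0, 0),  # no reaction; the 19-hour gap counts as a new login
    (3, 2 * HOUR_MS, 0, 0),   # a competitor (user 3) tweet on the timeline
])
links = [(1, 2), (2, 3)]      # 2 follows 1; 3 competes for 2's attention

# returns (#reactions to producer, ap, gamma, delta, rho);
# with this toy data, gamma = 0.5 and delta = 0.5
print calculate_ap(1, producer_tweets, 2, tweets, links)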