Attention Potential Validation Code

See the project website for more details.

Please report any issues to emaadahmed.manzoor@kaust.edu.sa.

Correlation Results

The attention potential (as estimated in section 4), when evaluated on this Twitter dataset (a short check of these figures follows the list):

  • Has a Pearson correlation of 73.61% with the number of retweets obtained.
  • Is significantly correlated with retweets (p < 0.05; R reports p < 2.2e-16).
  • Has a 95% confidence interval of [72.72%, 74.48%] on the correlation estimate.
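These figures are mutually consistent: the confidence interval is the Fisher z-transform interval around the sample correlation. A minimal sketch of the check, assuming r = 0.7361445 and n = 10480 (df + 2, from the R output below):

import math

r, n = 0.7361445, 10480            # sample correlation; n = df + 2 from cor.test
z = math.atanh(r)                  # Fisher z-transform of r
se = 1.0 / math.sqrt(n - 3)        # standard error of z
q = 1.959964                       # 97.5% normal quantile, as used by cor.test
lo, hi = math.tanh(z - q * se), math.tanh(z + q * se)
print "95%% CI: [%.7f, %.7f]" % (lo, hi)   # ~[0.7272485, 0.7447932]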

Execution

Running this requires having the following files in the same directory as validate_ap.py (their expected record layout is sketched after the list):

  • all_links.p
  • all_tweets.p
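The pickle schemas are not documented here; the field accesses in validate_ap.py suggest that each tweet is a tuple (user_id, timestamp_ms, retweeted_user_id, replied_to_user_id) and each link is a directed (source_id, target_id) pair, but these names are inferred, not authoritative. A quick way to inspect your copy of the data:

import cPickle as pickle

# field meanings inferred from validate_ap.py, not from a published schema:
#   tweet: (user_id, timestamp_ms, retweeted_user_id, replied_to_user_id)
#   link:  (source_id, target_id)
tweets = pickle.load(open('all_tweets.p', 'rb'))
links = pickle.load(open('all_links.p', 'rb'))
print tweets[0]
print links[0]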

It is run with a simple Python call: python validate_ap.py

The output is tab-separated, with the columns as follows:

producer_id | follower_id | number_of_reactions | attention_potential | gamma | delta | rho

The output can then be processed to evaluate the correlation between the number of reactions and attention potential. The following code does this in R:

mydata <- read.csv('output.txt', sep='\t', header=FALSE)
names(mydata) <- c('pid', 'fid', 'nreactions', 'ap', 'gamma', 'delta', 'rho')
nonzero <- subset(mydata, nreactions > 0 | ap > 0)
cor.test(nonzero$nreactions, nonzero$ap)

Output:

	Pearson's product-moment correlation

data:  nonzero$nreactions and nonzero$ap
t = 111.3336, df = 10478, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.7272485 0.7447932
sample estimates:
      cor 
0.7361445 
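For reference, a rough Python equivalent of the R snippet above, assuming NumPy and SciPy are installed (scipy.stats.pearsonr returns r and the p-value, but not the confidence interval):

import numpy as np
from scipy.stats import pearsonr

data = np.loadtxt('output.txt', delimiter='\t')
nreactions, ap = data[:, 2], data[:, 3]
keep = (nreactions > 0) | (ap > 0)       # same filter as the R subset() call
r, p = pearsonr(nreactions[keep], ap[keep])
print "r = %.7f, p = %g" % (r, p)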
validate_ap.py:

#!/usr/bin/env python
import cPickle as pickle
from itertools import groupby
from multiprocessing import Pool

LOGIN_THRESHOLD = 8  # hours

def calculate_ap(producer, producer_tweets, follower, tweets, links):
    # all the follower's tweets
    follower_tweets = [tweet for tweet in tweets
                       if tweet[0] == follower]
    if len(follower_tweets) == 0:
        return 0, 0.0, 0, 0, 0

    # all the competitors' tweets
    competitors = set([link[1] for link in links if link[0] == follower])
    competitor_tweets = [tweet for tweet in tweets
                         if tweet[0] in competitors]

    # estimate gamma: prior prob. of a follower reaction
    # this is a user-defined weight
    follower_reactions = [tweet for tweet in follower_tweets
                          if tweet[2] > 0 or tweet[3] > 0]
    gamma = len(follower_reactions) / float(len(follower_tweets))

    # estimate delta: prob. of the producer's tweet getting a follower reaction
    # this is the usual measure of (directed) tie strength
    follower_reactions_to_producer = [tweet for tweet in follower_reactions
                                      if tweet[2] == producer or tweet[3] == producer]
    delta = len(follower_reactions_to_producer) / float(len(producer_tweets))
    if gamma == 0 or delta == 0:
        return len(follower_reactions_to_producer), 0.0, 0, 0, 0

    # estimate rho: prob. of leaving the timeline after reading a post
    #
    # rho is the parameter of a geometric distribution,
    # estimated from the average number of posts consumed per login
    #
    # a tweet is "consumed" if it is reacted to
    # a "login" is counted for a follower's tweet
    # occurring after a gap of 8 hours (§2.2)
    num_logins = 1  # the first tweet counts as a login; also avoids division by zero
    prev = follower_tweets[0][1] / 1000
    for follower_tweet in follower_tweets[1:]:
        diff = (follower_tweet[1] / 1000 - prev) / 3600.  # hours
        if diff >= LOGIN_THRESHOLD:
            num_logins += 1
        prev = follower_tweet[1] / 1000
    mu = len(follower_reactions) / float(num_logins)
    rho = 1. / (1 + mu)

    # reconstruct the timeline: (timestamp in seconds, post type), newest first;
    # post types: 2 = follower, 1 = competitor, 0 = producer
    timeline = []
    timeline += [(t[1] / 1000, 2) for t in follower_tweets]
    timeline += [(t[1] / 1000, 1) for t in competitor_tweets]
    timeline += [(t[1] / 1000, 0) for t in producer_tweets]
    timeline = sorted(timeline, reverse=True)

    # calculate ap: geometric user/cluster survival fns
    ap = 0.0
    depth = 0
    cluster_size = 0
    cluster_ap = 0.0
    for post in timeline:
        if post[1] == 2:  # follower post: a login; reading restarts from the top
            depth = 0
            continue
        depth += 1
        if post[1] == 1:  # competitor post: close the current producer cluster
            ap += (delta ** cluster_size) * cluster_ap
            cluster_size = 0
            cluster_ap = 0.0
            continue
        if post[1] == 0:  # producer post: grow the current cluster
            cluster_size += 1
            cluster_ap += (1 - rho) ** depth
            continue

    ap *= gamma  # toggle this line to evaluate with and without gamma
    return len(follower_reactions_to_producer), ap, gamma, delta, rho

def main():
    tweets = sorted(pickle.load(open('all_tweets.p', 'rb')))
    links = sorted(pickle.load(open('all_links.p', 'rb')))

    # multiprocessing
    pool = Pool(8)
    results = []  # (producer, follower, async result)
    for producer, producer_links in groupby(links, key=lambda x: x[0]):
        producer_tweets = [tweet for tweet in tweets
                           if tweet[0] == producer]
        if len(producer_tweets) == 0:
            continue

        # calculate the attention potential of the producer for each follower
        for producer_link in producer_links:
            follower = producer_link[1]
            results.append((producer, follower,
                            pool.apply_async(calculate_ap,
                                             (producer, producer_tweets,
                                              follower, tweets, links))))

    for producer, follower, r in results:
        producer_rts, producer_ap, gamma, delta, rho = r.get()
        print "%s\t%s\t%d\t%.4f\t%.4f\t%.4f\t%.4f" % (
            producer, follower, producer_rts, producer_ap, gamma, delta, rho)

if __name__ == "__main__":
    main()
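As a quick sanity check, calculate_ap can be exercised on a hypothetical toy dataset. The IDs, timestamps, and tuple layout below are illustrative assumptions following the field accesses above, and the script is assumed to be importable as validate_ap:

from validate_ap import calculate_ap

HOUR_MS = 3600 * 1000
producer_tweets = [(1, 0 * HOUR_MS, 0, 0), (1, 5 * HOUR_MS, 0, 0)]
tweets = sorted(producer_tweets + [
    (2, 1 * HOUR_MS, 1, 0),   # the follower (user 2) retweets the producer (user 1)
    (2, 20 * HOUR_MS, 0, 0),  # no reaction; the 19-hour gap counts as a new login
    (3, 2 * HOUR_MS, 0, 0),   # a competitor (user 3) tweet on the timeline
])
links = [(1, 2), (2, 3)]      # 2 follows 1; 3 competes for 2's attention

# returns (#reactions to producer, ap, gamma, delta, rho);
# with this toy data, gamma = 0.5 and delta = 0.5
print calculate_ap(1, producer_tweets, 2, tweets, links)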