{{ message }}

Instantly share code, notes, and snippets.

Last active Aug 29, 2015
Quantifying Monotony Aversion
 #!/usr/bin/env python
"""Reaction statistics for clusters of consecutive same-author tweets.

For every user found in ``all_links.p`` the script rebuilds a timeline from
the tweets of the accounts that user follows, splits it into clusters (runs
of consecutive tweets by the same author) and counts how many tweets in each
cluster the user reacted to. A tab-separated summary per cluster size is
printed at the end.

Record layouts, as far as the indexing in this script shows:

* link:  ``link[0]`` is the followee, ``link[1]`` the following user.
* tweet: ``tweet[0]`` author, ``tweet[1]`` timeline sort key (presumably a
  timestamp -- confirm against the data), ``tweet[4]`` tweet id,
  ``tweet[5]`` / ``tweet[6]`` id of the tweet this one retweets / replies
  to, where a value that is not ``> 0`` means "none".
"""

from collections import defaultdict
from itertools import groupby

try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3 has no separate cPickle module

# Largest cluster size included in the printed summary.
MAX_CLUSTER_SIZE = 5
# Not referenced anywhere in this script; kept so external users of the
# module-level name keep working.
MAX_POSITION = 4


def _record_cluster(reaction_flags, cluster_reactions, position_counts,
                    position_reactions):
    """Fold one finished cluster into the running statistics (in place).

    ``reaction_flags`` holds one 0/1 entry per tweet of the cluster, in
    timeline order; its length is the cluster size.
    """
    size = len(reaction_flags)
    cluster_reactions[size].append(sum(reaction_flags))
    for pos, is_reaction in enumerate(reaction_flags):
        position_counts[(pos, size)] += 1
        # Only touch position_reactions on an actual reaction so that no
        # zero-valued keys are created.
        if is_reaction:
            position_reactions[(pos, size)] += 1


def update_cluster_stats(user, followees, tweets, cluster_reactions,
                         position_counts, position_reactions):
    """Add the cluster statistics of one user's timeline to the totals.

    Args:
        user: id of the user whose timeline is analysed.
        followees: container of author ids the user follows.
        tweets: iterable of tweet records (see the module docstring); it is
            traversed twice, so it must be re-iterable.
        cluster_reactions: mapping ``cluster_size -> list`` with one entry
            per cluster, the number of reacted-to tweets in that cluster.
        position_counts: mapping ``(position, cluster_size) -> count`` of
            tweets seen at that position.
        position_reactions: mapping ``(position, cluster_size) -> count`` of
            reacted-to tweets at that position.

    Returns:
        The same three mappings that were passed in, updated in place. They
        are expected to be ``defaultdict``-like (``list`` / ``int`` / ``int``).
    """
    # The user's timeline: followee tweets in descending order of field 1.
    timeline = sorted((tweet for tweet in tweets if tweet[0] in followees),
                      key=lambda tweet: tweet[1], reverse=True)
    if not timeline:
        return cluster_reactions, position_counts, position_reactions

    # Ids of every tweet the user retweeted or replied to.
    reacted_ids = set()
    for tweet in tweets:
        if tweet[0] != user:
            continue
        if tweet[5] > 0:  # retweet
            reacted_ids.add(tweet[5])
        if tweet[6] > 0:  # reply
            reacted_ids.add(tweet[6])

    # groupby yields exactly the runs of consecutive tweets by one author,
    # so the first and last cluster need no special handling.
    for _, run in groupby(timeline, key=lambda tweet: tweet[0]):
        flags = [1 if tweet[4] in reacted_ids else 0 for tweet in run]
        _record_cluster(flags, cluster_reactions, position_counts,
                        position_reactions)

    return cluster_reactions, position_counts, position_reactions


def _load_pickle(path):
    """Unpickle ``path``, closing the file even if loading fails."""
    # NOTE(security): unpickling can execute arbitrary code; only run this
    # on data files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def main():
    """Compute and print the per-cluster-size reaction summary."""
    tweets = sorted(_load_pickle('all_tweets.p'))
    # groupby below needs the links ordered by the same key it groups on.
    links = sorted(_load_pickle('all_links.p'), key=lambda link: link[1])

    cluster_reactions = defaultdict(list)
    position_counts = defaultdict(int)
    position_reactions = defaultdict(int)

    for user, followee_links in groupby(links, key=lambda link: link[1]):
        followees = set(link[0] for link in followee_links)
        cluster_reactions, position_counts, position_reactions = \
            update_cluster_stats(user, followees, tweets, cluster_reactions,
                                 position_counts, position_reactions)

    for cluster_size in range(1, MAX_CLUSTER_SIZE + 1):
        # statistics for clusters of this size
        reaction_counts = cluster_reactions[cluster_size]
        number_of_clusters = len(reaction_counts)
        number_of_nonempty_clusters = sum(
            1 for count in reaction_counts if count > 0)
        number_of_tweets = cluster_size * number_of_clusters
        number_of_reactions = sum(reaction_counts)
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('%d\t%d\t%d\t%d\t%d' % (cluster_size, number_of_reactions,
                                      number_of_tweets, number_of_clusters,
                                      number_of_nonempty_clusters))


if __name__ == "__main__":
    main()

See the project website for more details.

## Execution

Running this requires having the following files in the same directory as `calculate_cluster_statistics.py`:

• `all_links.p`
• `all_tweets.p`

It is run by a simple Python call: `python calculate_cluster_statistics.py`

Output:

```
1	15895	8437262	8437262	15895
2	2756	1818212	909106	2582
3	715	586551	195517	623
4	301	243848	60962	249
5	126	126250	25250	101
```

The output is tab-separated, with the columns as follows:

```
cluster_size | number_of_reactions | number_of_tweets | number_of_clusters | number_of_nonempty_clusters
```

A non-empty cluster is one in which at least one tweet has been reacted to.