emaadmanzoor/QuantifyingMonotonyAversion.md

## calculate_cluster_statistics.py
#!/usr/bin/env python

MAX_CLUSTER_SIZE = 5
MAX_POSITION = 4

from collections import defaultdict
import cPickle as pickle
from itertools import groupby

def update_cluster_stats(user, followees, tweets,
                         cluster_reactions,
                         position_counts, position_reactions):
    """Fold one user's timeline into the running cluster statistics.

    The timeline is every tweet whose author (``tweet[0]``) is in
    ``followees``, ordered by ``tweet[1]`` descending.  A cluster is a
    maximal run of consecutive timeline tweets by the same author.

    For each cluster of size ``n``:
      * ``cluster_reactions[n]`` gains one entry: how many tweets of the
        cluster ``user`` reacted to;
      * ``position_counts[(pos, n)]`` is incremented for every 0-based
        position ``pos`` inside the cluster;
      * ``position_reactions[(pos, n)]`` is incremented for every position
        whose tweet was reacted to.

    A tweet counts as reacted to when its id (``tweet[4]``) shows up as a
    positive ``tweet[5]`` (retweet) or ``tweet[6]`` (reply) value on one of
    ``user``'s own tweets.

    The three accumulators are updated in place and returned as a tuple
    ``(cluster_reactions, position_counts, position_reactions)``.
    """
    # Rebuild what the user saw: followee tweets, ordered by tweet[1]
    # descending.
    feed = [entry for entry in tweets if entry[0] in followees]
    feed.sort(key=lambda entry: entry[1], reverse=True)

    # Nothing to cluster for a user with an empty timeline.
    if not feed:
        return cluster_reactions, position_counts, position_reactions

    # Ids of every tweet the user retweeted or replied to.
    reacted_ids = set(entry[5] for entry in tweets
                      if entry[0] == user and entry[5] > 0)
    reacted_ids.update(entry[6] for entry in tweets
                       if entry[0] == user and entry[6] > 0)

    # groupby yields exactly the maximal same-author runs, including the
    # final one, so no separate "last cluster" flush is needed.
    for _, run in groupby(feed, key=lambda entry: entry[0]):
        flags = [1 if entry[4] in reacted_ids else 0 for entry in run]
        size = len(flags)

        cluster_reactions[size] += [sum(flags)]

        for pos, flag in enumerate(flags):
            slot = (pos, size)
            position_counts[slot] += 1
            if flag:
                position_reactions[slot] += 1

    return cluster_reactions, position_counts, position_reactions

if __name__ == "__main__":
    # NOTE: unpickling can execute arbitrary code; only run this against
    # all_tweets.p / all_links.p files that come from a trusted source.
    # The context managers close both inputs as soon as they are loaded
    # instead of leaving the handles open for the rest of the run.
    with open('all_tweets.p', 'rb') as tweets_file:
        tweets = sorted(pickle.load(tweets_file))
    with open('all_links.p', 'rb') as links_file:
        # groupby only merges adjacent items, so the links must be sorted
        # by the same key (link[1]) that is used for grouping below.
        links = sorted(pickle.load(links_file), key=lambda x: x[1])

    cluster_reactions = defaultdict(list)
    position_counts = defaultdict(int)
    position_reactions = defaultdict(int)
    for user, followee_links in groupby(links, key=lambda x: x[1]):
        followees = set([link[0] for link in followee_links])

        cluster_reactions, position_counts, position_reactions = \
            update_cluster_stats(user, followees, tweets,
                                 cluster_reactions,
                                 position_counts, position_reactions)

    # Only the per-size cluster statistics are reported; position_counts and
    # position_reactions are accumulated above but not printed here.
    for cluster_size in range(1, MAX_CLUSTER_SIZE + 1):
        # statistics for clusters of this size
        reaction_counts = cluster_reactions[cluster_size]
        number_of_clusters = len(reaction_counts)
        number_of_nonempty_clusters = sum(1 for count in reaction_counts
                                          if count > 0)
        number_of_tweets = cluster_size * number_of_clusters
        number_of_reactions = sum(reaction_counts)

        # A single parenthesised argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('%d\t%d\t%d\t%d\t%d' % (cluster_size, number_of_reactions,
                                      number_of_tweets, number_of_clusters,
                                      number_of_nonempty_clusters))

## QuantifyingMonotonyAversion.md

      
    Raw
  

              QuantifyingMonotonyAversion.md
            
          
    See the project website for more details.
Please report any issues to emaadahmed.manzoor@kaust.edu.sa.
### Execution

Running this requires having the following files in the same directory as calculate_cluster_statistics.py:

all_links.p
all_tweets.p

It is run with a single Python call: `python calculate_cluster_statistics.py`
Output:
1	15895	8437262	8437262	15895
2	2756	1818212	909106	2582
3	715	586551	195517	623
4	301	243848	60962	249
5	126	126250	25250	101

The output is tab-separated, with the columns as follows:
cluster_size | number_of_reactions | number_of_tweets | number_of_clusters | number_of_nonempty_clusters

A non-empty cluster is one in which at least one tweet has been reacted to.
	#!/usr/bin/env python

	MAX_CLUSTER_SIZE = 5
	MAX_POSITION = 4

	from collections import defaultdict
	import cPickle as pickle
	from itertools import groupby

	def update_cluster_stats(user, followees, tweets,
	                         cluster_reactions,
	                         position_counts, position_reactions):
	    """Fold one user's timeline into the running cluster statistics.

	    The timeline is every tweet whose author (tweet[0]) is in
	    followees, ordered by tweet[1] descending.  A cluster is a maximal
	    run of consecutive timeline tweets by the same author.

	    For each cluster of size n, cluster_reactions[n] gains the number
	    of reacted-to tweets in it, position_counts[(pos, n)] is incremented
	    for every 0-based position, and position_reactions[(pos, n)] is
	    incremented for every position whose tweet was reacted to.

	    The three accumulators are updated in place and returned as a tuple.
	    """

	    # reconstruct user's timeline
	    timeline = sorted([tweet for tweet in tweets
	                       if tweet[0] in followees],
	                      reverse=True, key=lambda x: x[1])

	    # empty timeline
	    if len(timeline) == 0:
	        return cluster_reactions, position_counts, position_reactions

	    # ids of all tweets reacted to by user
	    reactions = [tweet[5] for tweet in tweets           # retweets
	                 if tweet[0] == user and tweet[5] > 0] + \
	                [tweet[6] for tweet in tweets           # replies
	                 if tweet[0] == user and tweet[6] > 0]
	    reactions = set(reactions)

	    # calculate cluster statistics
	    current_cluster_size = 0
	    current_cluster_reactions = []
	    current_cluster_user = timeline[0][0]
	    for tweet in timeline:
	        if tweet[0] != current_cluster_user: # new cluster
	            # flush the cluster that just ended before starting the next
	            cluster_reactions[current_cluster_size] += [sum(current_cluster_reactions)]

	            for pos, is_reaction in enumerate(current_cluster_reactions):
	                position_counts[(pos, current_cluster_size)] += 1

	                if is_reaction:
	                    position_reactions[(pos, current_cluster_size)] += 1

	            current_cluster_size = 1
	            current_cluster_user = tweet[0]
	            current_cluster_reactions = []

	        elif tweet[0] == current_cluster_user: # in the same cluster
	            current_cluster_size += 1

	        # record whether the user reacted to this tweet (by its id, tweet[4])
	        if tweet[4] in reactions:
	            current_cluster_reactions += [1]
	        else:
	            current_cluster_reactions += [0]

	    # last cluster
	    cluster_reactions[current_cluster_size] += [sum(current_cluster_reactions)]

	    for pos, is_reaction in enumerate(current_cluster_reactions):
	        position_counts[(pos, current_cluster_size)] += 1

	        if is_reaction:
	            position_reactions[(pos, current_cluster_size)] += 1

	    return cluster_reactions, position_counts, position_reactions

	if __name__ == "__main__":
	    # NOTE: unpickling can execute arbitrary code; only run this against
	    # all_tweets.p / all_links.p files that come from a trusted source.
	    # The context managers close both inputs as soon as they are loaded
	    # instead of leaving the handles open for the rest of the run.
	    with open('all_tweets.p', 'rb') as tweets_file:
	        tweets = sorted(pickle.load(tweets_file))
	    with open('all_links.p', 'rb') as links_file:
	        # groupby only merges adjacent items, so the links must be sorted
	        # by the same key (link[1]) that is used for grouping below.
	        links = sorted(pickle.load(links_file), key=lambda x: x[1])

	    cluster_reactions = defaultdict(list)
	    position_counts = defaultdict(int)
	    position_reactions = defaultdict(int)
	    for user, followee_links in groupby(links, key=lambda x: x[1]):
	        followees = set([link[0] for link in followee_links])

	        cluster_reactions, position_counts, position_reactions = \
	            update_cluster_stats(user, followees, tweets,
	                                 cluster_reactions,
	                                 position_counts, position_reactions)

	    # Only the per-size cluster statistics are reported; position_counts and
	    # position_reactions are accumulated above but not printed here.
	    for cluster_size in range(1, MAX_CLUSTER_SIZE + 1):
	        # statistics for clusters of this size
	        reaction_counts = cluster_reactions[cluster_size]
	        number_of_clusters = len(reaction_counts)
	        number_of_nonempty_clusters = sum(1 for count in reaction_counts
	                                          if count > 0)
	        number_of_tweets = cluster_size * number_of_clusters
	        number_of_reactions = sum(reaction_counts)

	        # A single parenthesised argument prints the same text under the
	        # Python 2 print statement and the Python 3 print function.
	        print('%d\t%d\t%d\t%d\t%d' % (cluster_size, number_of_reactions,
	                                      number_of_tweets, number_of_clusters,
	                                      number_of_nonempty_clusters))