Skip to content

Instantly share code, notes, and snippets.

Created December 7, 2012 13:25
Show Gist options
  • Select an option

  • Save anonymous/4233271 to your computer and use it in GitHub Desktop.

Select an option

Save anonymous/4233271 to your computer and use it in GitHub Desktop.
Goes over Facebook groups and creates graphs of who invited who.
"""
!!!TODO: Duplicate names will screw your algorithm. Unfortunately, there is no easy
way to robustly deal with it. For now I will assume that names are not duplicates.
!!! Phooey! Facebook keeps "who added whom" only for a year.
"""
import pydot
import operator
import re
import random
import glob
import matplotlib.pyplot as plt
NO_JOIN_REASON = "NO JOIN REASON FOUND!!!"
ADDED_BY = "Added by "
INVITED_BY = "Invited by "
SELF_INVITED = "SELF^%$@" # Assume that no person in the group is named this way.
def get_joined_by_from_person_html(person_html):
# Was added?
for add_prefix in [ADDED_BY, INVITED_BY]:
x = re.findall(add_prefix + ".*?<", person_html)
if x:
return x[0][len(add_prefix):-2]
# Was alone?
x = re.findall("Joined.*?<", person_html)
if x:
return SELF_INVITED
# Unidentified
raise Exception(NO_JOIN_REASON)
def get_name_from_person_html(person_html):
x = re.findall("<a href.*?>", person_html)[0] # Bound to have it
person_html = person_html.replace(x,"")
return person_html[:person_html.find("</a>")]
def extract_data_from_person_html(person_html):
name = get_name_from_person_html(person_html)
joined_by = get_joined_by_from_person_html(person_html)
return (name, joined_by)
class Tree(object):
def __init__(self, name):
self.name = name
self.children = []
def add_new_person(self, name, joined_by):
"""
Adds a new person to the tree. You assume that this is the first time this method
has been invoked with "name".
(Note: however, cannot assume that "name" is not already in the tree;
maybe it was created because he invited someone. The program deals
with this, it's ok.
"""
if joined_by == SELF_INVITED:
# The person joined by himeself. See if he is already in the list;
# if not, add him at top level.
if not self.is_in_tree(name):
self.children.append(Tree(name))
else:
if not self.is_in_tree(joined_by):
# Create first level.
self.children.append(Tree(joined_by))
if not self.is_in_tree(name):
# The person is not in the forest. Just find his joined_by, and
# add a new node.
inviter_node = self.find(joined_by)
inviter_node.children.append(Tree(name))
else:
# This person was already inserted in the past, because he added
# someone else. Just need to move his tree. He should be in
# the top row; no need to look in children.
inviter_node = self.find(joined_by)
for i in xrange(len(self.children)):
child = self.children[i]
if child.name == name:
invitee_node = child
self.children.remove(child)
break
inviter_node.children.append(invitee_node)
pass
def is_in_tree(self, name):
if self.name == name:
return True
for c in self.children:
if c.is_in_tree(name):
return True
return False
def find(self, name):
if self.name == name:
return self
for c in self.children:
x = c.find(name)
if x:
return x
return None
def print_tree(self, indentation_level=""):
print indentation_level + self.name
for c in self.children:
c.print_tree(indentation_level+"\t")
def get_tree_by_invitees(self,depth=0):
# Explaining "to_add" and depth: we call this using the highest level
# tree. We want that every person who added themselves to the group by
# their own to have +1 to the number of people added. This corresponds to
# depth=1, if you start at depth=0 and the group name at the top.
to_add = 1 if depth == 1 else 0
x = [(self.name, len(self.children)+to_add)]
return_list = []
for c in self.children:
return_list+=c.get_tree_by_invitees(depth=depth+1)
return x + return_list
def add_to_graph(self, graph):
"""
For graphics and graph purposes.
"""
for c in self.children:
edge = pydot.Edge(self.name, c.name)
graph.add_edge(edge)
c.add_to_graph(graph)
def get_invite_tree_from_html(group_name, full_html):
people_html = re.findall("""\<a href.*?abbr title=""", full_html)
tree = Tree(group_name)
for person_html in people_html:
name, joined_by = extract_data_from_person_html(person_html)
tree.add_new_person(name, joined_by)
return tree
all_group_filenames = glob.glob("*.grp")
all_group_statistics = []
for filename in all_group_filenames:
with open(filename) as f:
x = f.read()
l = [a for a in x.splitlines() if "groupsSkyContentWideHeader" in a][0]
group_name = filename[:-4]
tree = get_invite_tree_from_html(group_name, l)
tree.print_tree()
how_many_invited = tree.get_tree_by_invitees()
how_many_invited = [x for x in how_many_invited if x[0]!=group_name]
people_in_group = sum([x[1] for x in how_many_invited])
invited_by_percent = how_many_invited
invited_by_percent = [ (x[0], float(x[1]) / people_in_group) for x in how_many_invited]
all_group_statistics += invited_by_percent
# You can filter out all those zeros if you want
all_group_statistics = [x for x in all_group_statistics if x[1]>0]
all_group_statistics.sort(key=operator.itemgetter(1), reverse=True)
x, y = zip(*[(x[0], x[1][1]) for x in zip(xrange(1, len(all_group_statistics)+1), all_group_statistics)])
# TODO: replace this with scipy
# Calculating the area under the curve...
total_area = 0
for i in xrange(len(y)-1):
total_area+= (y[i] + y[i+1])/2 # Trapezoid rule
print "The total area under the curve is: %s " % (total_area,)
print "The total number of people is %s " % (len(x),)
s = 0
i = 0
PERCENTAGE = 0.5
while s <= total_area * PERCENTAGE:
s+= (y[i] + y[i+1])/2 # Trapezoid rule
i+=1
print "%s of this is covered by %s %% of the people " % (PERCENTAGE, 100 * float(i) / len(x),)
print "Numerically, that's %s " % (i,)
print "(%s is %s, yes?) " % (PERCENTAGE, s,)
# Write to file:
with open("results.txt", "w") as f:
for (a,b) in zip(x,y):
f.write("%d\t%f\n" % (a,b))
fig = plt.figure()
ax = fig.add_subplot(121)
ax.plot(x, y, label = "")
ax.set_xscale("log")
ax.set_yscale("log")
plt.xlabel("Rank")
plt.ylabel("Percent of group invited")
ax = fig.add_subplot(122)
ax.plot(x, y, label = "")
plt.xlabel("Rank")
plt.ylabel("Percent of group invited")
plt.show()
### Do something like this if you want to draw a certain graph:
#~ graph = pydot.Dot()
#~ tree.children[0].add_to_graph(graph)
#~ graph.write_png('zzz.png')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment