Created
December 7, 2012 13:25
-
-
Save anonymous/4233271 to your computer and use it in GitHub Desktop.
Goes over Facebook groups and creates graphs of who invited who.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| !!!TODO: Duplicate names will screw your algorithm. Unfortunately, there is no easy | |
| way to robustly deal with it. For now I will assume that names are not duplicates. | |
| !!! Phooey! Facebook keeps "who added whom" only for a year. | |
| """ | |
| import pydot | |
| import operator | |
| import re | |
| import random | |
| import glob | |
| import matplotlib.pyplot as plt | |
| NO_JOIN_REASON = "NO JOIN REASON FOUND!!!" | |
| ADDED_BY = "Added by " | |
| INVITED_BY = "Invited by " | |
| SELF_INVITED = "SELF^%$@" # Assume that no person in the group is named this way. | |
| def get_joined_by_from_person_html(person_html): | |
| # Was added? | |
| for add_prefix in [ADDED_BY, INVITED_BY]: | |
| x = re.findall(add_prefix + ".*?<", person_html) | |
| if x: | |
| return x[0][len(add_prefix):-2] | |
| # Was alone? | |
| x = re.findall("Joined.*?<", person_html) | |
| if x: | |
| return SELF_INVITED | |
| # Unidentified | |
| raise Exception(NO_JOIN_REASON) | |
| def get_name_from_person_html(person_html): | |
| x = re.findall("<a href.*?>", person_html)[0] # Bound to have it | |
| person_html = person_html.replace(x,"") | |
| return person_html[:person_html.find("</a>")] | |
| def extract_data_from_person_html(person_html): | |
| name = get_name_from_person_html(person_html) | |
| joined_by = get_joined_by_from_person_html(person_html) | |
| return (name, joined_by) | |
| class Tree(object): | |
| def __init__(self, name): | |
| self.name = name | |
| self.children = [] | |
| def add_new_person(self, name, joined_by): | |
| """ | |
| Adds a new person to the tree. You assume that this is the first time this method | |
| has been invoked with "name". | |
| (Note: however, cannot assume that "name" is not already in the tree; | |
| maybe it was created because he invited someone. The program deals | |
| with this, it's ok. | |
| """ | |
| if joined_by == SELF_INVITED: | |
| # The person joined by himeself. See if he is already in the list; | |
| # if not, add him at top level. | |
| if not self.is_in_tree(name): | |
| self.children.append(Tree(name)) | |
| else: | |
| if not self.is_in_tree(joined_by): | |
| # Create first level. | |
| self.children.append(Tree(joined_by)) | |
| if not self.is_in_tree(name): | |
| # The person is not in the forest. Just find his joined_by, and | |
| # add a new node. | |
| inviter_node = self.find(joined_by) | |
| inviter_node.children.append(Tree(name)) | |
| else: | |
| # This person was already inserted in the past, because he added | |
| # someone else. Just need to move his tree. He should be in | |
| # the top row; no need to look in children. | |
| inviter_node = self.find(joined_by) | |
| for i in xrange(len(self.children)): | |
| child = self.children[i] | |
| if child.name == name: | |
| invitee_node = child | |
| self.children.remove(child) | |
| break | |
| inviter_node.children.append(invitee_node) | |
| pass | |
| def is_in_tree(self, name): | |
| if self.name == name: | |
| return True | |
| for c in self.children: | |
| if c.is_in_tree(name): | |
| return True | |
| return False | |
| def find(self, name): | |
| if self.name == name: | |
| return self | |
| for c in self.children: | |
| x = c.find(name) | |
| if x: | |
| return x | |
| return None | |
| def print_tree(self, indentation_level=""): | |
| print indentation_level + self.name | |
| for c in self.children: | |
| c.print_tree(indentation_level+"\t") | |
| def get_tree_by_invitees(self,depth=0): | |
| # Explaining "to_add" and depth: we call this using the highest level | |
| # tree. We want that every person who added themselves to the group by | |
| # their own to have +1 to the number of people added. This corresponds to | |
| # depth=1, if you start at depth=0 and the group name at the top. | |
| to_add = 1 if depth == 1 else 0 | |
| x = [(self.name, len(self.children)+to_add)] | |
| return_list = [] | |
| for c in self.children: | |
| return_list+=c.get_tree_by_invitees(depth=depth+1) | |
| return x + return_list | |
| def add_to_graph(self, graph): | |
| """ | |
| For graphics and graph purposes. | |
| """ | |
| for c in self.children: | |
| edge = pydot.Edge(self.name, c.name) | |
| graph.add_edge(edge) | |
| c.add_to_graph(graph) | |
| def get_invite_tree_from_html(group_name, full_html): | |
| people_html = re.findall("""\<a href.*?abbr title=""", full_html) | |
| tree = Tree(group_name) | |
| for person_html in people_html: | |
| name, joined_by = extract_data_from_person_html(person_html) | |
| tree.add_new_person(name, joined_by) | |
| return tree | |
| all_group_filenames = glob.glob("*.grp") | |
| all_group_statistics = [] | |
| for filename in all_group_filenames: | |
| with open(filename) as f: | |
| x = f.read() | |
| l = [a for a in x.splitlines() if "groupsSkyContentWideHeader" in a][0] | |
| group_name = filename[:-4] | |
| tree = get_invite_tree_from_html(group_name, l) | |
| tree.print_tree() | |
| how_many_invited = tree.get_tree_by_invitees() | |
| how_many_invited = [x for x in how_many_invited if x[0]!=group_name] | |
| people_in_group = sum([x[1] for x in how_many_invited]) | |
| invited_by_percent = how_many_invited | |
| invited_by_percent = [ (x[0], float(x[1]) / people_in_group) for x in how_many_invited] | |
| all_group_statistics += invited_by_percent | |
| # You can filter out all those zeros if you want | |
| all_group_statistics = [x for x in all_group_statistics if x[1]>0] | |
| all_group_statistics.sort(key=operator.itemgetter(1), reverse=True) | |
| x, y = zip(*[(x[0], x[1][1]) for x in zip(xrange(1, len(all_group_statistics)+1), all_group_statistics)]) | |
| # TODO: replace this with scipy | |
| # Calculating the area under the curve... | |
| total_area = 0 | |
| for i in xrange(len(y)-1): | |
| total_area+= (y[i] + y[i+1])/2 # Trapezoid rule | |
| print "The total area under the curve is: %s " % (total_area,) | |
| print "The total number of people is %s " % (len(x),) | |
| s = 0 | |
| i = 0 | |
| PERCENTAGE = 0.5 | |
| while s <= total_area * PERCENTAGE: | |
| s+= (y[i] + y[i+1])/2 # Trapezoid rule | |
| i+=1 | |
| print "%s of this is covered by %s %% of the people " % (PERCENTAGE, 100 * float(i) / len(x),) | |
| print "Numerically, that's %s " % (i,) | |
| print "(%s is %s, yes?) " % (PERCENTAGE, s,) | |
| # Write to file: | |
| with open("results.txt", "w") as f: | |
| for (a,b) in zip(x,y): | |
| f.write("%d\t%f\n" % (a,b)) | |
| fig = plt.figure() | |
| ax = fig.add_subplot(121) | |
| ax.plot(x, y, label = "") | |
| ax.set_xscale("log") | |
| ax.set_yscale("log") | |
| plt.xlabel("Rank") | |
| plt.ylabel("Percent of group invited") | |
| ax = fig.add_subplot(122) | |
| ax.plot(x, y, label = "") | |
| plt.xlabel("Rank") | |
| plt.ylabel("Percent of group invited") | |
| plt.show() | |
| ### Do something like this if you want to draw a certain graph: | |
| #~ graph = pydot.Dot() | |
| #~ tree.children[0].add_to_graph(graph) | |
| #~ graph.write_png('zzz.png') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment