Skip to content

Instantly share code, notes, and snippets.

@butlermh
Created November 29, 2013 10:41
Show Gist options
  • Save butlermh/7704077 to your computer and use it in GitHub Desktop.
Save butlermh/7704077 to your computer and use it in GitHub Desktop.
Python script to convert Apple forum data set http://sifaka.cs.uiuc.edu/~wang296/Data/index.html to GML so it can be imported into Gephi.
import glob
import itertools
import os

import networkx as nx
from dateutil.parser import parse
from unidecode import unidecode
''' This file requires the networkx, unidecode and python-dateutil packages to be installed, e.g.
easy_install networkx
easy_install unidecode
easy_install python-dateutil'''
class ForumToGraph(object):
    """Convert Apple forum thread files into author and topic graphs.

    Two undirected graphs are filled while thread files are read:

    * ``author_graph`` has one node per author; two authors are linked when
      they posted in the same topic.
    * ``topic_graph`` has one node per topic; ``add_topic_links`` links two
      topics when the same author posted in both.

    Every node carries ``number_of_posts`` plus ``oldest`` and ``newest``
    (the raw timestamp strings of its earliest and latest post, or ``''``
    when none was seen).  ``add_topic_links`` adds ``length_of_posting``
    (whole days between those two timestamps).

    The code avoids version-specific syntax so that it runs on both
    Python 2 and Python 3.
    """

    def __init__(self):
        """Create empty graphs and the author/topic lookup tables."""
        self.topic_graph = nx.Graph()
        self.author_graph = nx.Graph()
        self.author_to_topics = {}
        self.topic_to_authors = {}
        # Parsed [oldest, newest] datetimes per node.  They are kept beside
        # the graphs (not as node attributes) so that only plain strings
        # and numbers end up in the GML output.
        self._author_span = {}
        self._topic_span = {}

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _attrs(graph, node_id):
        """Return the mutable attribute dict of ``node_id`` in ``graph``."""
        try:
            return graph.nodes[node_id]
        except TypeError:
            # Older networkx releases expose ``nodes`` as a method and keep
            # the attribute dicts in ``graph.node`` instead.
            return graph.node[node_id]

    @staticmethod
    def _link_pairwise(graph, members):
        """Add an edge between every pair of distinct ``members``."""
        for first, second in itertools.combinations(members, 2):
            graph.add_edge(first, second)

    @staticmethod
    def _widen_span(spans, key, attrs, post_time, when):
        """Extend the oldest/newest range of one node with a new post.

        ``spans[key]`` holds the parsed range, ``attrs`` the node attribute
        dict receiving the matching raw strings.  ``post_time`` is the raw
        timestamp and ``when`` its parsed value.
        """
        span = spans.get(key)
        if span is None:
            spans[key] = [when, when]
            attrs['oldest'] = post_time
            attrs['newest'] = post_time
            return
        if when < span[0]:
            span[0] = when
            attrs['oldest'] = post_time
        if when > span[1]:
            span[1] = when
            attrs['newest'] = post_time

    @staticmethod
    def _posting_days(attrs):
        """Return the whole days between a node's oldest and newest post.

        Returns 0 when the node has no recorded timestamps.
        """
        oldest = attrs['oldest']
        newest = attrs['newest']
        if not oldest or not newest:
            return 0
        return (parse(newest) - parse(oldest)).days

    @staticmethod
    def _mean(total, count):
        """Return ``total / count`` as a float, or 0.0 if ``count`` is 0."""
        return float(total) / count if count else 0.0

    @staticmethod
    def _percent(part, whole):
        """Return ``part`` as a percentage of ``whole`` (0.0 if empty)."""
        return part * 100.0 / whole if whole else 0.0

    @staticmethod
    def _whole_days(total_days, count):
        """Return the average of ``total_days`` in whole days (0 if empty)."""
        return total_days // count if count else 0

    def _start_topic(self, topic_id, title):
        """Register ``topic_id`` as a fresh topic holding its first post."""
        self.topic_to_authors[topic_id] = set()
        # The node attributes are reset below, so drop any stale range too.
        self._topic_span.pop(topic_id, None)
        self.topic_graph.add_node(topic_id, title=title, number_of_posts=1,
                                  oldest='', newest='')

    def _record_author(self, raw_name, topic_id):
        """Count one post by the author named in ``raw_name``.

        Returns the normalised author id.
        """
        # unfortunately gml does not support unicode
        author_id = unidecode(raw_name.strip().replace(' ', '_'))
        self.topic_to_authors[topic_id].add(author_id)
        if author_id in self.author_graph:
            self._attrs(self.author_graph, author_id)['number_of_posts'] += 1
        else:
            self.author_graph.add_node(author_id, number_of_posts=1,
                                       oldest='', newest='')
        self.author_to_topics.setdefault(author_id, set()).add(topic_id)
        return author_id

    def _record_time(self, post_time, author_id, topic_id):
        """Fold one post timestamp into its author's and topic's range."""
        try:
            when = parse(post_time)
        except (ValueError, OverflowError):
            # A timestamp that cannot be parsed cannot be ordered (and
            # could not be used by add_topic_links either), so skip it.
            return
        if author_id is not None:
            self._widen_span(self._author_span, author_id,
                             self._attrs(self.author_graph, author_id),
                             post_time, when)
        self._widen_span(self._topic_span, topic_id,
                         self._attrs(self.topic_graph, topic_id),
                         post_time, when)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def read_file(self, filename):
        """Read one thread file and add its topic and authors to the graphs.

        The first ``<ID>`` line names the topic; every further ``<ID>``
        line counts as one more post in it.  ``<Author>`` and ``<Time>``
        lines update the current author and the topic, and the first
        non-empty ``<Title>`` becomes the topic title.  If two authors
        contributed to the same topic they are regarded as being connected.

        Timestamps are ordered by their parsed value, not by their text.
        ``<Author>`` and ``<Time>`` lines that appear before the first
        ``<ID>`` are ignored, and a file without any ``<ID>`` adds nothing.
        The base name of the file and the topic title are printed as
        progress output.
        """
        topic_id = None
        topic_title = ''
        author_id = None
        with open(filename) as f:
            for line in f:
                if line.startswith('<Title>'):
                    if topic_title == '':
                        topic_title = unidecode(line[7:].strip())
                        print("%s %s" % (os.path.basename(filename),
                                         topic_title))
                        if topic_id is not None:
                            topic = self._attrs(self.topic_graph, topic_id)
                            topic['title'] = topic_title
                elif line.startswith('<ID>'):
                    if topic_id is None:
                        topic_id = line[4:].strip()
                        self._start_topic(topic_id, topic_title)
                    else:
                        topic = self._attrs(self.topic_graph, topic_id)
                        topic['number_of_posts'] += 1
                elif topic_id is None:
                    # Nothing to attribute this line to yet.
                    continue
                elif line.startswith('<Author>'):
                    author_id = self._record_author(line[8:], topic_id)
                elif line.startswith('<Time>'):
                    self._record_time(line[6:].strip(), author_id, topic_id)
        if topic_id is None:
            return
        self._link_pairwise(self.author_graph,
                            self.topic_to_authors[topic_id])

    def add_topic_links(self):
        """Compute posting periods and link topics that share an author.

        Sets ``length_of_posting`` (whole days between the oldest and the
        newest post, 0 without timestamps) on every topic and author node,
        then adds an edge between each pair of topics that had
        contributions from the same author.
        """
        for topic_id in self.topic_to_authors:
            attrs = self._attrs(self.topic_graph, topic_id)
            attrs['length_of_posting'] = self._posting_days(attrs)
        for author_id, topics in self.author_to_topics.items():
            attrs = self._attrs(self.author_graph, author_id)
            attrs['length_of_posting'] = self._posting_days(attrs)
            self._link_pairwise(self.topic_graph, topics)

    def print_statistics(self):
        """Print summary statistics for the author and topic graphs.

        Reads ``length_of_posting`` from the nodes, so ``add_topic_links``
        must have been called first.  Contact and post averages are true
        (floating point) means; period averages are in whole days.  An
        empty data set prints zeros instead of failing.
        """
        num_authors = len(self.author_to_topics)
        author_no_contacts = 0
        only_one_post = 0
        sum_contacts = 0
        sum_posts = 0
        sum_membership_period = 0
        for author_id in self.author_to_topics:
            # list() because neighbors() may be an iterator without len().
            contacts = len(list(self.author_graph.neighbors(author_id)))
            if contacts == 0:
                author_no_contacts += 1
            sum_contacts += contacts
            attrs = self._attrs(self.author_graph, author_id)
            if attrs['number_of_posts'] == 1:
                only_one_post += 1
            sum_posts += attrs['number_of_posts']
            sum_membership_period += attrs['length_of_posting']

        print("Author graph")
        print("------------")
        print("Total number of authors %s" % num_authors)
        print("Total number of links between authors %s"
              % self.author_graph.number_of_edges())
        print("Total number of authors with no contacts %s = %s %%"
              % (author_no_contacts,
                 self._percent(author_no_contacts, num_authors)))
        print("Total number of authors who only posted once %s = %s %%"
              % (only_one_post, self._percent(only_one_post, num_authors)))
        print("Average number of contacts per author %s"
              % self._mean(sum_contacts, num_authors))
        print("Average number of posts per author %s"
              % self._mean(sum_posts, num_authors))
        print("Average length of author membership %s days"
              % self._whole_days(sum_membership_period, num_authors))
        print("")

        topic_num = len(self.topic_to_authors)
        sum_topic_lifetime = 0
        topic_only_one_author = 0
        for topic_id, authors in self.topic_to_authors.items():
            attrs = self._attrs(self.topic_graph, topic_id)
            sum_topic_lifetime += attrs['length_of_posting']
            if len(authors) == 1:
                topic_only_one_author += 1

        print("Topic graph")
        print("-----------")
        print("Total number of topics %s" % topic_num)
        print("Total number of links between topics %s"
              % self.topic_graph.number_of_edges())
        print("Average topic lifetime %s days"
              % self._whole_days(sum_topic_lifetime, topic_num))
        print("Total number of topics with only one author %s = %s %%"
              % (topic_only_one_author,
                 self._percent(topic_only_one_author, topic_num)))

    def save_graphs(self):
        """Store the graphs in GML format so they can be processed with Gephi.

        Writes ``authors.gml`` and ``topics.gml`` to the current working
        directory.
        """
        nx.write_gml(self.author_graph, 'authors.gml')
        nx.write_gml(self.topic_graph, 'topics.gml')

    def read_dir(self, file_path):
        """Read all ``*.html.token`` files in the directories under
        ``file_path``.

        Only the immediate sub-directories of ``file_path`` are scanned;
        other entries are skipped.  Directories and files are visited in
        sorted order so that repeated runs build the graphs in the same
        order.  The working directory of the process is left untouched.
        """
        for directory in sorted(os.listdir(file_path)):
            dir_path = os.path.join(file_path, directory)
            if not os.path.isdir(dir_path):
                continue
            pattern = os.path.join(dir_path, '*.html.token')
            for filename in sorted(glob.glob(pattern)):
                self.read_file(filename)
# Script entry: run the whole conversion when this file is executed.
f = ForumToGraph()
# 'apple' is a relative path, so the data directory is looked up relative to
# the current working directory.
f.read_dir('apple')
# Derive posting periods and topic-to-topic links; print_statistics reads the
# 'length_of_posting' values this step stores on the nodes.
f.add_topic_links()
f.print_statistics()
# Write authors.gml and topics.gml.
f.save_graphs()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment