Skip to content

Instantly share code, notes, and snippets.

Created June 2, 2010 18:21
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save anonymous/422761 to your computer and use it in GitHub Desktop.
Save anonymous/422761 to your computer and use it in GitHub Desktop.
# Target is to generate a graph file in gexf format (http://gexf.net/format) for Gephi
#-----------------------------------------------------
# STEP 1
# Generate nodes and edgelist from each email log file
#-----------------------------------------------------
setwd("C:/R")
# use sqldf for operations suited for db http://code.google.com/p/sqldf/
library(sqldf)
# define utility functions
object.sizes <- function(obs=ls(envir=.GlobalEnv)){return(rev(sort(sapply(obs, function (object.name) object.size(get(object.name))))))}
# Create an empty data frame from a header list
empty.df<- function(header){
df<-data.frame(matrix(matrix(rep(1,length(header)),1),1))
colnames(df)<-header
return(df[NULL,])
}
# break down large data problem into smaller ones
max_rows<-200000
filelist<-c("Emails Dec 2009.txt", "Emails Jan 2010.txt", "Emails Feb 2010.txt")
# data format in the email logs is as below
# date from_address from_name to_address to_name
# 12-12-2009 john_doe@gmail.com John Doe jane_smith@hotmail.com Jane Smith
for(k in 1:length(filelist)){
system.time(raw_emails<-read.csv(filelist[k],sep="\t",header=T,strip.white=TRUE))
# remove system-generated emails
emails<-raw_emails[-grep('donotreply|error|webmaster|www', paste(raw_emails[,3],raw_emails[,5]), ignore.case=TRUE, value=FALSE),]
# filenames for collecting nodes and edges
node_file<-paste("nodes",k,"-",filelist[k],sep="")
edge_file<-paste("edges",k,"-",filelist[k],sep="")
# get to_nodes in "node_id, node_label" format
to_nodes<-emails[c(-1,-2,-3)]
# get from_nodes in "node_id, node_label" format
from_nodes<-emails[c(-1,-4,-5)]
# get edgelist in "from_node_id, to_node_id" format
edgelist<-emails[c(-1,-3,-5)]
# change column names for rbind
colnames(to_nodes)<-c("id","name")
colnames(from_nodes)<-c("id","name")
all_nodes<-rbind(to_nodes,from_nodes)
# convert all nodes and edgelist to lowercase... using SQL
system.time(all_nodes_lowercase<-sqldf('SELECT LOWER(id) uid, LOWER(name) label FROM all_nodes'))
system.time(edgelist_lowercase<-sqldf('SELECT LOWER(Originator) originator, LOWER(Recipient) recipient FROM edgelist'))
unique_nodes<-unique(all_nodes_lowercase)
sorted_unique_nodes<- unique_nodes[order(unique_nodes[,1]),]
write.csv(sorted_unique_nodes, file = node_file, row.names=FALSE, quote = FALSE)
num_blocks<-ceiling(nrow(edgelist_lowercase)/max_rows)
start_row<-0
edgecount <- empty.df(c("originator","recipient","count(1)"))
for(i in 1:num_blocks){
sql_statement<-paste('select originator, recipient, count(1) FROM (select originator, recipient FROM edgelist_lowercase LIMIT ', start_row, ',', max_rows, ') group by originator, recipient order by originator, recipient')
print(system.time(counts<-sqldf(sql_statement)))
edgecount<- rbind(edgecount, counts)
start_row<-start_row + max_rows
}
system.time(sqldf("create index edgecount1 on edgecount (originator, recipient)"))
system.time(final_edgecount <- sqldf("select originator, recipient, sum(count_1_) FROM edgecount group by originator, recipient order by originator, recipient"))
write.csv(final_edgecount, file = edge_file, row.names=FALSE, quote = FALSE)
}
#----------------------------------------------------------------------
# STEP 2
# Combine node and edgelist files into one large node and edgelist file
#----------------------------------------------------------------------
all_file_nodes <- empty.df(c("id","label"))
all_file_edges <- empty.df(c("originator","recipient", "sum.count_1_."))
for(k in 1:length(filelist)){
node_file<-paste("nodes",k,"-",filelist[k],sep="")
edge_file<-paste("edges",k,"-",filelist[k],sep="")
# read each node file and rbind with all_file_nodes
system.time(nodes<-read.csv(node_file,sep=",",header=T,strip.white=TRUE))
all_file_nodes <- rbind(all_file_nodes, nodes)
# read each edge file and rbind with all_file_edges
system.time(edges<-read.csv(edge_file,sep=",",header=T,strip.white=TRUE))
all_file_edges <- rbind(all_file_edges, edges)
}
unique_all_file_nodes<-unique(all_file_nodes)
sorted_unique_all_file_nodes<- unique_all_file_nodes[order(unique_all_file_nodes[,1]),]
# write nodes in this form --- <node id="0" label="Hello" />
nodexml<-paste("<node id=\"", sorted_unique_all_file_nodes[,1], "\"", " label=\"", sorted_unique_all_file_nodes[,2],"\""," />", sep="")
write.csv(as.data.frame(nodexml, optional=TRUE), file = "All Nodes.txt", quote = FALSE, row.names = FALSE)
# edge operations
# use pragma table_info to see the table attributes to use in sum sql below
# sqldf("pragma table_info(all_file_edges)")
unique_all_file_edges<-sqldf('select originator, recipient, sum(sum_count_1__) FROM all_file_edges group by originator, recipient order by sum(sum_count_1__)')
nrow(unique_all_file_edges)
# filter out edges with wt 0 to 2
thicker_edges<-unique_all_file_edges[unique_all_file_edges[,3]>3,]
# write edges in this form --- <edge id="0" source="0" target="1" type="directed" weight="2.4" />
edgelistxml<-paste("<edge id=\"", rownames(thicker_edges), "\" ", "source=\"", thicker_edges[,1], "\" target=\"", thicker_edges[,2], "\" weight=\"", thicker_edges[,3], "\"/>", sep="")
# write edges for gexf file. Convert to data.frame to prevent printing the column name
write.csv(as.data.frame(edgelistxml, optional=TRUE), file = "All Edges.txt", quote = FALSE, row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment