Skip to content

Instantly share code, notes, and snippets.

@jdesilvio
Created June 5, 2015 23:55
Show Gist options
  • Save jdesilvio/96b67d542eea8e41d688 to your computer and use it in GitHub Desktop.
Save jdesilvio/96b67d542eea8e41d688 to your computer and use it in GitHub Desktop.
This script is designed to create a JSON formatted for the D3 Sankey Diagram
##################################################################################################
### This script is designed to create a JSON formatted for the D3 Sankey Diagram. The JSON ###
### consists of nodes and links. The script can take in 2 or more categorical variables ###
### and 1 numerical value variable. ###
##################################################################################################
# import libraries
library(reshape2)
library(plyr)
library(rjson)
##################################################################################################
### FUNCTIONS ####################################################################################
##################################################################################################
# Create the node table for a single source -> target path.
#
# Collects the distinct labels found in `srceList` followed by those in
# `targetList` (first occurrence wins) and numbers them consecutively.
#
# Args:
#   srceList:   source labels; a vector/factor, or a list wrapping one.
#   targetList: target labels, in the same form as `srceList`.
#   nodeStart:  number given to the first node (default 0).
#   ...:        ignored; kept so existing calls keep working.
#
# Returns:
#   A data frame with one row per distinct label and columns `name`, `node`.
createNodes <- function(srceList, targetList, nodeStart = 0, ...) {
  # unlist() accepts both bare vectors and list-wrapped vectors;
  # as.character() turns factors into their labels rather than their codes.
  nodeNames <- unique(c(as.character(unlist(srceList)),
                        as.character(unlist(targetList))))
  nodesDF <- as.data.frame(nodeNames)
  names(nodesDF)[1] <- "name"
  nodeCount <- nrow(nodesDF)
  # seq(a, a - 1) counts downwards and returns two values, so an empty
  # table needs its own branch.
  nodesDF$node <- if (nodeCount > 0) {
    seq(nodeStart, nodeStart + nodeCount - 1)
  } else {
    integer(0)
  }
  nodesDF
}
# Create the node table for one categorical column (multiple-path case).
#
# Args:
#   srceList:  vector or factor of labels for a single column.
#   nodeStart: number given to the first node (default 0); callers pass the
#              count of nodes already allocated so numbering continues.
#   ...:       ignored; kept so existing calls keep working.
#
# Returns:
#   A data frame with one row per distinct label and columns `name`, `node`.
createNodesMulti <- function(srceList, nodeStart = 0, ...) {
  nodesDF <- as.data.frame(unique(as.character(srceList)))
  names(nodesDF)[1] <- "name"
  nodeCount <- nrow(nodesDF)
  # seq(a, a - 1) counts downwards and returns two values, so an empty
  # column needs its own branch.
  nodesDF$node <- if (nodeCount > 0) {
    seq(nodeStart, nodeStart + nodeCount - 1)
  } else {
    integer(0)
  }
  nodesDF
}
# Attach node numbers to a table of source/target/value rows.
#
# Args:
#   df:      data frame with columns `srce` and `target` (any other columns,
#            e.g. `value`, are carried through unchanged).
#   nodesDF: data frame with columns `name` and `node`.
#
# Returns:
#   `df` with two extra columns, `srceNode` and `targetNode`, holding the
#   node number whose `name` equals the row's `srce` / `target`. Labels that
#   are not present in `nodesDF` get NA.
#
# Raises:
#   An error when a label used in `df` appears more than once in
#   `nodesDF$name`, because the node it refers to would be ambiguous.
sourceTargetValue <- function(df, nodesDF) {
  # Compare as character so factor/character/numeric keys all line up.
  nodeNames <- as.character(nodesDF$name)
  repeated <- unique(nodeNames[duplicated(nodeNames)])

  lookupNode <- function(labels) {
    keys <- as.character(labels)
    ambiguous <- intersect(keys, repeated)
    if (length(ambiguous) > 0) {
      stop("Node name(s) map to more than one node: ",
           paste(ambiguous, collapse = ", "), call. = FALSE)
    }
    # match() keeps the row order and length of `labels`.
    nodesDF$node[match(keys, nodeNames)]
  }

  stv <- df
  stv$srceNode <- lookupNode(stv$srce)
  stv$targetNode <- lookupNode(stv$target)
  stv
}
# Format node and link tables as a D3 Sankey JSON string.
#
# Args:
#   nodesDF:           data frame with columns `node` and `name`.
#   sourceTargetValue: data frame with columns `srceNode`, `targetNode`
#                      and `value` (one row per link).
#
# Returns:
#   A single character string of the form
#   {"nodes": [{"node":..., "name":"..."}, ...],
#    "links": [{"source":..., "target":..., "value":...}, ...]}
createSankeyJSON <- function(nodesDF, sourceTargetValue) {
  # Node names are user data: escape the characters that would otherwise
  # terminate or corrupt the JSON string literal. Backslash must go first so
  # the escapes added afterwards are not doubled.
  escapeJSON <- function(text) {
    text <- gsub("\\", "\\\\", text, fixed = TRUE)
    text <- gsub("\"", "\\\"", text, fixed = TRUE)
    text <- gsub("\n", "\\n", text, fixed = TRUE)
    text <- gsub("\r", "\\r", text, fixed = TRUE)
    gsub("\t", "\\t", text, fixed = TRUE)
  }

  # Build the separate node and link entries.
  nodeEntries <- sprintf('\n\t\t{"node":%s, "name":"%s"}',
                         nodesDF$node,
                         escapeJSON(as.character(nodesDF$name)))
  linkEntries <- sprintf('\n\t\t{"source":%s, "target":%s, "value":%s}',
                         sourceTargetValue$srceNode,
                         sourceTargetValue$targetNode,
                         sourceTargetValue$value)

  # Combine both lists into the final Sankey document.
  sprintf('{\n\t"nodes": [%s\n\t],\n\t"links": [%s\n\t]\n}',
          toString(nodeEntries), toString(linkEntries))
}
# Create Sankey JSON for 2 categorical variables (a single path).
#
# Args:
#   df:               input data frame.
#   sourceTargetList: the two column names, source first, then target.
#   value:            name of the column holding the numeric flow value.
#
# Returns:
#   The JSON string produced by createSankeyJSON().
createSankeyDataSingle <- function(df, sourceTargetList, value) {
  dfKeep <- df[c(sourceTargetList, value)]
  names(dfKeep) <- c("srce", "target", "value")

  # Strip every "$" and space before converting (e.g. "$1 000 000");
  # removing only the first occurrence would leave such values as NA.
  dfKeep$value <- as.numeric(gsub("[$ ]", "", as.character(dfKeep$value)))

  # Sum the value for each source/target pair and keep the positive flows.
  flows <- melt(xtabs(value ~ ., data = dfKeep))
  flows <- subset(flows, value > 0)

  # Assign node numbers to each source and target.
  nodesDF <- createNodes(list(flows$srce), list(flows$target))

  # Attach the node numbers to the flows and render the JSON.
  links <- sourceTargetValue(flows, nodesDF)
  createSankeyJSON(nodesDF, links)
}
# Create Sankey JSON for more than 2 categorical variables (multiple paths).
#
# Each pair of adjacent columns in `sourceTargetList` forms one stage of the
# diagram. Every column gets its own set of nodes, so a label that occurs in
# two different columns becomes two distinct nodes.
#
# Args:
#   df:               input data frame.
#   sourceTargetList: column names, ordered from left to right in the diagram.
#   value:            name of the column holding the numeric flow value.
#
# Returns:
#   The JSON string produced by createSankeyJSON().
createSankeyDataMulti <- function(df, sourceTargetList, value) {
  nColumns <- length(sourceTargetList)
  if (nColumns < 2) {
    stop("Make sure 'sourceTargetList' has at least 2 elements", call. = FALSE)
  }

  # One node table per column, numbered consecutively across columns.
  dfLevels <- df[sourceTargetList]
  nodesByColumn <- vector("list", nColumns)
  nextNode <- 0L
  for (i in seq_len(nColumns)) {
    nodesByColumn[[i]] <- createNodesMulti(dfLevels[[i]], nextNode)
    nextNode <- nextNode + nrow(nodesByColumn[[i]])
  }
  nodesDF <- do.call(rbind, nodesByColumn)

  # Look labels up in the node table of the column they came from; looking
  # them up in the combined table is ambiguous when columns share a label.
  nodeFor <- function(labels, columnNodes) {
    columnNodes$node[match(as.character(labels),
                           as.character(columnNodes$name))]
  }

  stageLinks <- lapply(seq_len(nColumns - 1), function(i) {
    dfKeep <- df[c(sourceTargetList[i:(i + 1)], value)]
    names(dfKeep) <- c("srce", "target", "value")

    # Strip every "$" and space before converting (e.g. "$1 000 000").
    dfKeep$value <- as.numeric(gsub("[$ ]", "", as.character(dfKeep$value)))

    # Sum the value for each source/target pair and keep the positive flows.
    flows <- melt(xtabs(value ~ ., data = dfKeep))
    flows <- subset(flows, value > 0)

    data.frame(
      srceNode = nodeFor(flows$srce, nodesByColumn[[i]]),
      targetNode = nodeFor(flows$target, nodesByColumn[[i + 1]]),
      value = flows$value
    )
  })
  links <- do.call(rbind, stageLinks)

  # Render the nodes and links as JSON.
  createSankeyJSON(nodesDF, links)
}
# Create Sankey JSON for either a single path or multiple paths, chosen by
# the number of categorical columns supplied.
#
# Args:
#   df:               input data frame.
#   sourceTargetList: two or more column names, ordered left to right.
#   value:            name of the column holding the numeric flow value.
#
# Returns:
#   The JSON string from createSankeyDataSingle() (exactly 2 columns) or
#   createSankeyDataMulti() (more than 2 columns).
#
# Raises:
#   An error when fewer than 2 columns are given. Printing a message and
#   falling through would evaluate an unassigned `sankeyJSON`, which either
#   fails with "object not found" or picks up a stale global of that name.
createSankeyData <- function(df, sourceTargetList, value) {
  nColumns <- length(sourceTargetList)
  if (nColumns < 2) {
    stop("Make sure 'sourceTargetList' has at least 2 elements", call. = FALSE)
  }
  if (nColumns == 2) {
    createSankeyDataSingle(df, sourceTargetList, value)
  } else {
    createSankeyDataMulti(df, sourceTargetList, value)
  }
}
##################################################################################################
### MAIN #########################################################################################
##################################################################################################
# Load the input table.
# NOTE(review): `file`, `sourceTargetList`, `value` and `outputPath` are not
# assigned anywhere in the visible script; presumably they must be set before
# this point -- confirm.
df <- read.csv(file = file)
# Build the Sankey JSON string from the loaded table.
sankeyJSON <- createSankeyData(df = df,
                               sourceTargetList = sourceTargetList,
                               value = value)
# Write the JSON string, followed by a newline, to the output file.
cat(sankeyJSON, file = outputPath, sep = "\n")
##################################################################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment