Created
June 5, 2015 23:55
-
-
Save jdesilvio/96b67d542eea8e41d688 to your computer and use it in GitHub Desktop.
This script is designed to create a JSON formatted for the D3 Sankey Diagram
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##################################################################################################
### This script is designed to create a JSON formatted for the D3 Sankey Diagram. The JSON     ###
### consists of nodes and links. The script can take in 2 or more categorical variables and    ###
### 1 numerical value variable.                                                                ###
##################################################################################################
# import libraries | |
library(reshape2) | |
library(plyr) | |
library(rjson) | |
################################################################################################## | |
### FUNCTIONS #################################################################################### | |
################################################################################################## | |
# create nodes for each node name (single path) | |
createNodes <- function(srceList, targetList, nodeStart = 0, ...) {
  # Build the node table for a single source -> target path.
  #
  # Args:
  #   srceList:   source labels; either a plain vector or a list wrapping one
  #               or more vectors (the form used by createSankeyDataSingle).
  #   targetList: target labels, in the same form as srceList.
  #   nodeStart:  number given to the first node; the rest follow consecutively.
  #   ...:        ignored, accepted for backward compatibility.
  #
  # Returns:
  #   A data frame with columns `name` (unique labels, sources first, in order
  #   of first appearance) and `node` (consecutive numbers from nodeStart).
  #   An empty input yields a zero-row data frame instead of an error.

  # Flatten either accepted form to a plain character vector.
  flatten <- function(x) {
    if (is.list(x)) {
      x <- unlist(lapply(x, as.character), use.names = FALSE)
    }
    as.character(x)
  }

  labels <- unique(c(flatten(srceList), flatten(targetList)))
  nodesDF <- data.frame(name = labels, stringsAsFactors = FALSE)

  # seq(a, a - 1) counts DOWN and returns two values, so guard the empty case.
  n <- nrow(nodesDF)
  nodesDF$node <- if (n > 0) seq(nodeStart, nodeStart + n - 1) else integer(0)
  nodesDF
}
# create nodes for each node name (multiple paths) | |
createNodesMulti <- function(srceList, nodeStart = 0, ...) {
  # Build the node table for one categorical variable of a multi-path diagram.
  #
  # Args:
  #   srceList:  vector of labels (e.g. one data frame column).
  #   nodeStart: number given to the first node; the rest follow consecutively.
  #   ...:       ignored, accepted for backward compatibility.
  #
  # Returns:
  #   A data frame with columns `name` (unique labels in order of first
  #   appearance) and `node` (consecutive numbers from nodeStart). An empty
  #   input yields a zero-row data frame instead of an error.
  labels <- unique(as.character(srceList))
  nodesDF <- data.frame(name = labels, stringsAsFactors = FALSE)

  # seq(a, a - 1) counts DOWN and returns two values, so guard the empty case.
  n <- nrow(nodesDF)
  nodesDF$node <- if (n > 0) seq(nodeStart, nodeStart + n - 1) else integer(0)
  nodesDF
}
# create a data frame consisting of sources, targets and values | |
sourceTargetValue <- function(df, nodesDF) {
  # Attach node numbers to a table of links.
  #
  # Args:
  #   df:      data frame with columns `srce` and `target` holding node labels
  #            (any other columns, e.g. `value`, are carried through untouched).
  #   nodesDF: data frame with columns `name` and `node`.
  #
  # Returns:
  #   `df` with two extra columns, `srceNode` and `targetNode`, in the same row
  #   order. Labels that are missing from nodesDF get NA.
  #
  # Labels are compared as character so that factor / character / numeric
  # columns all match. match() always returns exactly one position per row
  # (the first hit), so duplicated names in nodesDF can no longer produce more
  # rows than `df` has, which previously made the column assignment fail.
  nodeNames <- as.character(nodesDF$name)

  stv <- df
  stv$srceNode <- nodesDF$node[match(as.character(stv$srce), nodeNames)]
  stv$targetNode <- nodesDF$node[match(as.character(stv$target), nodeNames)]
  stv
}
# format data into a JSON | |
createSankeyJSON <- function(nodesDF, sourceTargetValue) {
  # Format the node and link tables as the JSON text used by the D3 Sankey
  # diagram.
  #
  # Args:
  #   nodesDF:           data frame with columns `node` and `name`.
  #   sourceTargetValue: data frame with columns `srceNode`, `targetNode` and
  #                      `value`.
  #
  # Returns:
  #   A single character string: {"nodes": [...], "links": [...]}.

  # Node names are user data, so escape everything that would end or corrupt a
  # JSON string. Backslash must be handled first so that the backslashes added
  # by the later replacements are not doubled again.
  escapeJSON <- function(x) {
    x <- as.character(x)
    x <- gsub("\\", "\\\\", x, fixed = TRUE)
    x <- gsub("\"", "\\\"", x, fixed = TRUE)
    x <- gsub("\n", "\\n", x, fixed = TRUE)
    x <- gsub("\r", "\\r", x, fixed = TRUE)
    x <- gsub("\t", "\\t", x, fixed = TRUE)
    x
  }

  # create separate node and link JSON fragments, one per row
  nodesList <- sprintf('\n\t\t{"node":%s, "name":"%s"}',
                       nodesDF$node, escapeJSON(nodesDF$name))
  linksList <- sprintf('\n\t\t{"source":%s, "target":%s, "value":%s}',
                       sourceTargetValue$srceNode,
                       sourceTargetValue$targetNode,
                       sourceTargetValue$value)

  # combine into a sankey JSON
  sprintf('{\n\t"nodes": [%s\n\t],\n\t"links": [%s\n\t]\n}',
          toString(nodesList), toString(linksList))
}
# create sankey data for 2 categorical variables (1 path) | |
createSankeyDataSingle <- function(df, sourceTargetList, value) {
  # Create sankey JSON for exactly 2 categorical variables (a single path).
  #
  # Args:
  #   df:               input data frame.
  #   sourceTargetList: names of the source and target columns, in that order.
  #   value:            name of the numeric value column.
  #
  # Returns:
  #   The sankey JSON as a single character string.
  dfKeep <- df[c(sourceTargetList, value)]
  names(dfKeep) <- c("srce", "target", "value")

  # Strip EVERY currency sign, thousands separator and whitespace character
  # before converting; removing only the first occurrence left values such as
  # "$1,000" or "1 000 000" unparseable and silently turned them into NA.
  dfKeep$value <- as.numeric(gsub("[$,[:space:]]", "", as.character(dfKeep$value)))

  # total value per (srce, target) pair, in long format; drop empty pairs
  linkTable <- melt(xtabs(value ~ ., data = dfKeep))
  linkTable <- subset(linkTable, value > 0)

  # assign node numbers to each target and source
  nodesDF <- createNodes(list(linkTable$srce), list(linkTable$target))

  # create sankey data to write to JSON
  links <- sourceTargetValue(linkTable, nodesDF)
  createSankeyJSON(nodesDF, links)
}
# create sankey data for more than 2 categorical variables (multiple paths) | |
createSankeyDataMulti <- function(df, sourceTargetList, value) {
  # Create sankey JSON for more than 2 categorical variables (multiple paths).
  # Each consecutive pair of variables forms one stage of links.
  #
  # Args:
  #   df:               input data frame.
  #   sourceTargetList: names of the categorical columns, in path order.
  #   value:            name of the numeric value column.
  #
  # Returns:
  #   The sankey JSON as a single character string.
  nLevels <- length(sourceTargetList)
  if (nLevels < 2) {
    stop("Make sure 'sourceTargetList' has at least 2 elements", call. = FALSE)
  }

  # One node table per variable, numbered consecutively across variables.
  nodesByLevel <- vector("list", nLevels)
  count <- 0
  for (i in seq_len(nLevels)) {
    nodesByLevel[[i]] <- createNodesMulti(df[[sourceTargetList[i]]], count)
    count <- count + nrow(nodesByLevel[[i]])
  }
  nodesDF <- do.call(rbind, nodesByLevel)

  # Look a label up in the node table of ONE variable. Nodes are numbered per
  # variable, so a label that occurs in two variables is two different nodes;
  # resolving by name over the combined table is ambiguous in that case.
  nodeOf <- function(labels, levelNodes) {
    levelNodes$node[match(as.character(labels), as.character(levelNodes$name))]
  }

  linksByStage <- vector("list", nLevels - 1)
  for (i in seq_len(nLevels - 1)) {
    dfKeep <- df[c(sourceTargetList[i:(i + 1)], value)]
    names(dfKeep) <- c("srce", "target", "value")

    # Strip EVERY currency sign, thousands separator and whitespace character;
    # removing only the first occurrence silently produced NA values.
    dfKeep$value <- as.numeric(gsub("[$,[:space:]]", "", as.character(dfKeep$value)))

    # total value per (srce, target) pair, in long format; drop empty pairs
    stage <- melt(xtabs(value ~ ., data = dfKeep))
    stage <- subset(stage, value > 0)

    stage$srceNode <- nodeOf(stage$srce, nodesByLevel[[i]])
    stage$targetNode <- nodeOf(stage$target, nodesByLevel[[i + 1]])
    linksByStage[[i]] <- stage
  }
  tableAll <- do.call(rbind, linksByStage)

  # create sankey data to write to JSON
  createSankeyJSON(nodesDF, tableAll)
}
# create sankey data for either single or multiple paths, determined by input | |
createSankeyData <- function(df, sourceTargetList, value) {
  # Create sankey JSON for either a single path or multiple paths, depending
  # on how many categorical variables are given.
  #
  # Args:
  #   df:               input data frame.
  #   sourceTargetList: names of the categorical columns, in path order
  #                     (at least 2).
  #   value:            name of the numeric value column.
  #
  # Returns:
  #   The sankey JSON as a single character string.
  #
  # Raises:
  #   An error when fewer than 2 categorical variables are supplied.
  nVars <- length(sourceTargetList)

  # Fail loudly: merely printing the hint and falling through evaluated an
  # undefined result, which either raised an unrelated "object not found"
  # error or silently returned a stale global of the same name.
  if (nVars < 2) {
    stop("Make sure 'sourceTargetList' has at least 2 elements", call. = FALSE)
  }

  if (nVars == 2) {
    createSankeyDataSingle(df, sourceTargetList, value)
  } else {
    createSankeyDataMulti(df, sourceTargetList, value)
  }
}
################################################################################################## | |
### MAIN ######################################################################################### | |
################################################################################################## | |
# import and clean data | |
# NOTE(review): `file`, `sourceTargetList`, `value` and `outputPath` are not
# defined anywhere in this script as shown; presumably they are set by the user
# before this section runs. Confirm before relying on it.

# load the input table
df <- read.csv(file)

# build the sankey JSON string from the chosen columns
sankeyJSON <- createSankeyData(df, sourceTargetList, value)

# save the JSON string to the output file
write(sankeyJSON, file = outputPath)
################################################################################################## |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment