Skip to content

Instantly share code, notes, and snippets.

@bxshi
Last active March 22, 2024 19:02
Show Gist options
  • Save bxshi/b9ce0bbabc458447d1d5 to your computer and use it in GitHub Desktop.
Save bxshi/b9ce0bbabc458447d1d5 to your computer and use it in GitHub Desktop.
R port of graph similarity algorithm DeltaCon
####### Please Check github.com/bxshi/rdsg for new updates, this gist will not be change in the future #######
## However, you can still use this since it is correct so far ##
# Port of original DeltaCon to R
# Baoxu(Dash) Shi
# Data Sciense Group
# University of Notre Dame
#
# Citation
# D. Koutra, J. T. Vogelstein, and C. Faloutsos:
# DeltaCon: A Principled Massive-Graph Similarity Function.
# SIAM 2013: 162–170.
# D. Koutra, T. Ke, U. Kang, D. H. Chau, H. K. Pao, C. Faloutsos:
# Unifying Guilt-by-Association Approaches: Theorems and Fast Algorithms.
# ECML/PKDD (2) 2011: 245-260
library(Matrix)
library(SparseM)
library(pracma)
.MAX_POWER = 10
.p = 0.51
output_time <- function(debug, tim, s) {
if(debug) {
print(paste(s, "user time:", tim[1], "system time:", tim[2], "elapsed time:", tim[3]))
}
}
inverse_lbp <- function(graph, nnodes, priors=NULL, times = 10, debug = FALSE) {
# Sparse identity matrix
I = NULL
tim <- system.time(
{
I <- .sparseDiagonal(nnodes, x = 1, shape = "t")
})
output_time(debug, tim, "Create sparse identity matrix")
# Sparse degree-diagonal matrix, D[i,i] = sum(graph[i,])
x = NULL
tim <- system.time(
{
x <- rowSums(graph, sparseResult = TRUE)
})
output_time(debug, tim, "Calculate degree of each node")
D = NULL
tim <- system.time(
{
D <- sparseMatrix(c(1:nnodes), c(1:nnodes), x = x, dims = c(nnodes, nnodes))
})
output_time(debug, tim, "Create degree diagonal matrix")
# Compute about-half homophily factor to guarantee covergence
c1 = sum(D) + 2
c2 = sum(D^2) - 1
h_h = sqrt((-c1 + sqrt(c1^2 + 4 * c2)) / (8 * c2))
# Compute constant ah and ch
ah = 4 * h_h^2 / (1 - 4 * h_h^2)
ch = 2 * h_h / (1 - 4 * h_h^2)
# Invert matrices M1 and M2
M = NULL
tim <-system.time({
M = ch * graph - ah * D
})
output_time(debug, tim, "Initialize Invert matrix")
# Calculate inverse of M
if (is.null(priors)) {
inv_ = I
mat_ = M
pow = 1
tim <- system.time({
while(max(mat_) > 1e-09 && pow < .MAX_POWER) {
inv_ = inv_ + mat_
mat_ = mat_ %*% M
pow = pow + 1
}
})
output_time(debug, tim, "Invert of matrix")
return(inv_)
} else {
final_mat <- NULL
tim <- system.time({
for(i in c(1:times)) {
inv_ = matrix(priors[, i], nnodes, 1)
mat_ = matrix(priors[, i], nnodes, 1)
pow = 1
while(max(mat_) > 1e-09 && pow < .MAX_POWER) {
mat_ = M %*% mat_
inv_ = inv_ + mat_
pow = pow + 1
}
if (i == 1) {
final_mat <- matrix(inv_)
} else {
final_mat <- cbind(final_mat, matrix(inv_))
}
}
})
output_time(debug, tim, "Approximate invert of matrix")
return(final_mat)
}
}
init_priors_percent <- function(percent, nnodes) {
times <- ceiling(1 / percent)
init_nodes <- floor(percent * nnodes)
rand_vector <- runif(nnodes)
rand_mat <- repmat(matrix(rand_vector, nnodes, 1), 1, times)
for(i in c(1:times)) {
rand_mat[rand_mat[,i] >= (i-1)*percent & rand_mat[,i] < i *percent, i] <- 1
rand_mat[rand_mat[,i] != 1, i] <- 0
}
return(rand_mat)
}
delta_con <- function(graph1, graph2, nnodes,
method = "naive", percent = 0.1, debug = FALSE, symmetrical = TRUE) {
if(ncol(graph1)!=2 || ncol(graph2)!=2) {
print("Input file should be data.frame with two cols(src dst).")
return(0);
}
colnames(graph1) <- c("src", "dst")
colnames(graph2) <- c("src", "dst")
# Construct sparse adjacent matrix from edge list
node_vector <- NULL
tim <- system.time({
g1 <- sparseMatrix(graph1$src, graph1$dst, x=1, dims=c(nnodes, nnodes))
g2 <- sparseMatrix(graph2$src, graph2$dst, x=1, dims=c(nnodes, nnodes))
})
output_time(debug, tim, "Construct sparse adjacent matrix")
# Change directed graph to undirected
if(symmetrical) {
tim <- system.time({
g1 <- g1 + t(g1)
g2 <- g2 + t(g2)
})
output_time(debug, tim, "Change to directed graph")
}
if (method == "fast") {
# Number of groups to be initialized
ngroups <- ceiling(1 / percent)
repetitions <- 10
t_all <- rep(0, repetitions)
sim <- rep(0, repetitions)
for(i in c(1:repetitions)) {
priors <- NULL
tim <- system.time({
priors <- init_priors_percent(percent, nnodes)
})
output_time(debug, tim, "Calculate priors")
inv1 <- inverse_lbp(g1, nnodes, priors, ngroups, debug = debug) * (.p - 0.5)
inv2 <- inverse_lbp(g2, nnodes, priors, ngroups, debug = debug) * (.p - 0.5)
sim[i] <- 1 / 1 / (1 + sqrt(sum( (sqrt(inv1) - sqrt(inv2))^2 )))
}
delta_con <- mean(sim)
return(delta_con)
} else {
# Naive FaBP
inv1 <- inverse_lbp(g1, nnodes, debug = debug) * (.p - 0.5)
inv2 <- inverse_lbp(g2, nnodes, debug = debug) * (.p - 0.5)
# Compute DeltaCon similarity score
delta_con <- 1 / (1 + sqrt(sum( (sqrt(inv1) - sqrt(inv2))^2 )))
return(delta_con)
}
}
delta_con_example <- function() {
g1 <- as.data.frame(cbind(c(1,1,2,2,3,3,4,8,5),
c(2,3,4,5,6,7,8,9,10)))
g2 <- as.data.frame(cbind(c(1,1,2,2,3,3,4,8,5,9,10,5,6),
c(2,3,4,5,6,7,8,9,10,11,11,12,12)))
print(delta_con(g1,g1,max(g1,g2), debug = TRUE))
print(delta_con(g1,g2,max(g1,g2), debug = TRUE))
print(delta_con(g2,g1,max(g1,g2), debug = TRUE))
print(delta_con(g2,g2,max(g1,g2), debug = TRUE))
print(delta_con(g1,g1,max(g1,g2), node_list = c(1,2,3,4,5,6,7,8,9,10), method="fast", debug = TRUE))
print(delta_con(g1,g2,max(g1,g2), node_list = c(1,2,3,4,5,6,7,8,9,10), method="fast", debug = TRUE))
print(delta_con(g2,g1,max(g1,g2), method="fast", debug = TRUE))
print(delta_con(g2,g2,max(g1,g2), method="fast", debug = TRUE))
print(delta_con(g1,g1,max(g1,g2), method="fast", symmetrical = FALSE))
print(delta_con(g1,g2,max(g1,g2), method="fast", symmetrical = FALSE))
print(delta_con(g2,g1,max(g1,g2), method="fast", symmetrical = FALSE))
print(delta_con(g2,g2,max(g1,g2), method="fast", symmetrical = FALSE))
print(delta_con(g1,g1,max(g1,g2), symmetrical = FALSE))
print(delta_con(g1,g2,max(g1,g2), symmetrical = FALSE))
print(delta_con(g2,g1,max(g1,g2), symmetrical = FALSE))
print(delta_con(g2,g2,max(g1,g2), symmetrical = FALSE))
g3 <- as.data.frame(cbind(c(2,3),
c(1,1)))
g4 <- as.data.frame(cbind(c(2,3),
c(4,4)))
g5 <- as.data.frame(cbind(c(2,3,3,1,5,6,6),
c(1,1,2,4,4,4,5)))
g6 <- as.data.frame(cbind(c(2,3,3,5,6,6),
c(1,1,2,4,4,5)))
}
@bxshi
Copy link
Author

bxshi commented Oct 17, 2014

Use method="fast" if you have a graph with more than 1million nodes, it works perfectly!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment