Selbosh/community.R Secret

## community.R
set.seed(2017)
C <- scrooge::citations
n <- nrow(C)

#G <- sort(sample.int(n, floor(n/2))) # split into two communities
#G <- list(c1 = G, c2 = setdiff(1:n, G))
G <- split(1:n, cutree(hclust(as.dist(cor(C))), h = .85)) # hierarchical clustering into 4 groups
ngroups <- length(G)

Y <- as.matrix(cbind(win1 = t(C)[lower.tri(C)],
                     win2 = C[lower.tri(C)]))
npairs <- nrow(Y)

get_comm <- function(xs, comm) {
  # Given a vector of journals `xs` and a membership list `comm`,
  # return the community ID of each element of `xs`
  in_list <- function(e, l) which(sapply(l, function(li) e %in% li))
  idx <- sapply(xs, function(x) in_list(x, comm))
  as.vector(idx)
}

X <- matrix(0, npairs, n + ngroups)
X[cbind(1:npairs, col(C)[lower.tri(C)])] <- 1 # winning journal
X[cbind(1:npairs, row(C)[lower.tri(C)])] <- -1 # losing journal
X[cbind(1:npairs, n + get_comm(col(C)[lower.tri(C)], G))] <- 1 # winning community
X[cbind(1:npairs, n + get_comm(row(C)[lower.tri(C)], G))] <- -1 # losing community
colnames(X) <- varNames <- c(colnames(C), paste0('Community', 1:ngroups))
X <- X[, -1] # First variable is redundant
model <- glm(Y ~ 0 + X, family = binomial)
summary(model)

# Get vector of ALL coefficients s.t. mean = 0
mu <- setNames(c(0, coef(model)), varNames)
mu <- mu - mean(mu)

# Convert into alpha(i, c) parametrisation
alpha <- outer(mu[1:n], mu[(n+1):(n+ngroups)], '-')
(alpha <- alpha - mean(alpha))

#-----------------------------------------------------------
# The above design matrix assumes additive effects,
# which isn't really what we discussed/are interested in.
# What if there are actually n * ngroups coefs?
# ----------------------------------------------------------

X2 <- matrix(0, npairs, n * ngroups)
X2[cbind(1:npairs, col(C)[lower.tri(C)] + n * (get_comm(row(C)[lower.tri(C)], G) - 1))] <- 1 # winning journal : losing community
X2[cbind(1:npairs, row(C)[lower.tri(C)] + n * (get_comm(col(C)[lower.tri(C)], G) - 1))] <- -1 # losing journal : winning community
colnames(X2) <- varNames2 <- c(outer(colnames(C), paste0('C', 1:ngroups), paste, sep = '__'))
X2 <- X2[, -1] # get rid of (at least) one redundant column (?)

model2 <- glm(Y ~ 0 + X2, family = binomial) ## Not sure if this really works yet. Try `?alias` to find redundancies
summary(model2)

mu2 <- setNames(c(0, coef(model2)), varNames2)
(mu2 <- mu2 - mean(mu2, na.rm = TRUE)) ## Possibly meaningless due to many NAs

# NB: this method doesn't like communities of size 1, because they will be entirely redundant
	set.seed(2017)
	C <- scrooge::citations
	n <- nrow(C)

	#G <- sort(sample.int(n, floor(n/2))) # split into two communities
	#G <- list(c1 = G, c2 = setdiff(1:n, G))
	G <- split(1:n, cutree(hclust(as.dist(cor(C))), h = .85)) # hierarchical clustering into 4 groups
	ngroups <- length(G)

	Y <- as.matrix(cbind(win1 = t(C)[lower.tri(C)],
	win2 = C[lower.tri(C)]))
	npairs <- nrow(Y)

	get_comm <- function(xs, comm) {
	# Given a vector of journals `xs` and a membership list `comm`,
	# return the community ID of each element of `xs`
	in_list <- function(e, l) which(sapply(l, function(li) e %in% li))
	idx <- sapply(xs, function(x) in_list(x, comm))
	as.vector(idx)
	}

	X <- matrix(0, npairs, n + ngroups)
	X[cbind(1:npairs, col(C)[lower.tri(C)])] <- 1 # winning journal
	X[cbind(1:npairs, row(C)[lower.tri(C)])] <- -1 # losing journal
	X[cbind(1:npairs, n + get_comm(col(C)[lower.tri(C)], G))] <- 1 # winning community
	X[cbind(1:npairs, n + get_comm(row(C)[lower.tri(C)], G))] <- -1 # losing community
	colnames(X) <- varNames <- c(colnames(C), paste0('Community', 1:ngroups))
	X <- X[, -1] # First variable is redundant
	model <- glm(Y ~ 0 + X, family = binomial)
	summary(model)

	# Get vector of ALL coefficients s.t. mean = 0
	mu <- setNames(c(0, coef(model)), varNames)
	mu <- mu - mean(mu)

	# Convert into alpha(i, c) parametrisation
	alpha <- outer(mu[1:n], mu[(n+1):(n+ngroups)], '-')
	(alpha <- alpha - mean(alpha))

	#-----------------------------------------------------------
	# The above design matrix assumes additive effects,
	# which isn't really what we discussed/are interested in.
	# What if there are actually n * ngroups coefs?
	# ----------------------------------------------------------

	X2 <- matrix(0, npairs, n * ngroups)
	X2[cbind(1:npairs, col(C)[lower.tri(C)] + n * (get_comm(row(C)[lower.tri(C)], G) - 1))] <- 1 # winning journal : losing community
	X2[cbind(1:npairs, row(C)[lower.tri(C)] + n * (get_comm(col(C)[lower.tri(C)], G) - 1))] <- -1 # losing journal : winning community
	colnames(X2) <- varNames2 <- c(outer(colnames(C), paste0('C', 1:ngroups), paste, sep = '__'))
	X2 <- X2[, -1] # get rid of (at least) one redundant column (?)

	model2 <- glm(Y ~ 0 + X2, family = binomial) ## Not sure if this really works yet. Try `?alias` to find redundancies
	summary(model2)

	mu2 <- setNames(c(0, coef(model2)), varNames2)
	(mu2 <- mu2 - mean(mu2, na.rm = TRUE)) ## Possibly meaningless due to many NAs

	# NB: this method doesn't like communities of size 1, because they will be entirely redundant