johnlees/pneumo_wgcna.R

## pneumo_wgcna.R
require(WGCNA)

# See:
# https://labs.genetics.ucla.edu/horvath/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/FemaleLiver-02-networkConstr-man.pdf

SPV_2_name = read.delim("SPV_2_name.txt", header=FALSE)
colnames(SPV_2_name) = c("SPV", "gene")

# co-expression matrix from Veening lab
# https://www.biorxiv.org/content/early/2018/03/22/283739
# data: https://zenodo.org/record/1157923

# once
`18_matrix_long_5` <- read.csv("18_matrix_long_5.csv", stringsAsFactors=FALSE)
`18_matrix_long_4` <- read.csv("18_matrix_long_4.csv", stringsAsFactors=FALSE)
`18_matrix_long_3` <- read.csv("18_matrix_long_3.csv", stringsAsFactors=FALSE)
`18_matrix_long_2` <- read.csv("18_matrix_long_2.csv", stringsAsFactors=FALSE)
`18_matrix_long_1` <- read.csv("18_matrix_long_1.csv", stringsAsFactors=FALSE)
all = rbind(`18_matrix_long_1`, `18_matrix_long_2`, `18_matrix_long_3`, `18_matrix_long_4`, `18_matrix_long_5`)
all_M = matrix(nrow = 2152, ncol = 2152, dimnames=list(origin=unique(all$gene1), dev=unique(all$gene2)))
all_M[cbind(all$gene1, all$gene2)] = all$correlation
saveRDS(all_M, "coexpression_matrix.Rds")
# subsequently
all_M = readRDS("coexpression_matrix.Rds")

# Run WGCNA as in tutorial
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold.fromSimilarity(all_M, powerVector = powers, verbose = 5)

sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 = 0.9

plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

softPower = 5
adjacency = adjacency.fromSimilarity(all_M, power = softPower)

# Turn adjacency into topological overlap
TOM = TOMsimilarity(adjacency);
dissTOM = 1-TOM

# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)

# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 10;
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
                            deepSplit = 2, pamRespectsDendro = FALSE,
                            minClusterSize = minModuleSize);
table(dynamicMods)

# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")

colorOrder = c("grey", standardColors(50))
moduleLabels = match(dynamicColors, colorOrder)-1
modules = data.frame(SPV=rownames(all_M), module=moduleLabels)

labelled_modules = merge(modules, SPV_2_name, by.x = "SPV", by.y = "SPV", all.x = TRUE)
write.table(labelled_modules, "WGCNA_modules.tsv", quote = F, sep = "\t", row.names = F, col.names = F)

#### !
# Can't do this with similarity
#### !
# Not run

# Calculate eigengenes
MEList = moduleEigengenes(all_M, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")
	require(WGCNA)

	# See:
	# https://labs.genetics.ucla.edu/horvath/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/FemaleLiver-02-networkConstr-man.pdf

	SPV_2_name = read.delim("SPV_2_name.txt", header=FALSE)
	colnames(SPV_2_name) = c("SPV", "gene")

	# co-expression matrix from Veening lab
	# https://www.biorxiv.org/content/early/2018/03/22/283739
	# data: https://zenodo.org/record/1157923

	# once
	`18_matrix_long_5` <- read.csv("18_matrix_long_5.csv", stringsAsFactors=FALSE)
	`18_matrix_long_4` <- read.csv("18_matrix_long_4.csv", stringsAsFactors=FALSE)
	`18_matrix_long_3` <- read.csv("18_matrix_long_3.csv", stringsAsFactors=FALSE)
	`18_matrix_long_2` <- read.csv("18_matrix_long_2.csv", stringsAsFactors=FALSE)
	`18_matrix_long_1` <- read.csv("18_matrix_long_1.csv", stringsAsFactors=FALSE)
	all = rbind(`18_matrix_long_1`, `18_matrix_long_2`, `18_matrix_long_3`, `18_matrix_long_4`, `18_matrix_long_5`)
	all_M = matrix(nrow = 2152, ncol = 2152, dimnames=list(origin=unique(all$gene1), dev=unique(all$gene2)))
	all_M[cbind(all$gene1, all$gene2)] = all$correlation
	saveRDS(all_M, "coexpression_matrix.Rds")
	# subsequently
	all_M = readRDS("coexpression_matrix.Rds")

	# Run WGCNA as in tutorial
	powers = c(c(1:10), seq(from = 12, to=20, by=2))
	sft = pickSoftThreshold.fromSimilarity(all_M, powerVector = powers, verbose = 5)

	sizeGrWindow(9, 5)
	par(mfrow = c(1,2))
	cex1 = 0.9

	plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
	xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
	main = paste("Scale independence"))
	text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
	labels=powers,cex=cex1,col="red")
	# this line corresponds to using an R^2 cut-off of h
	abline(h=0.90,col="red")
	# Mean connectivity as a function of the soft-thresholding power
	plot(sft$fitIndices[,1], sft$fitIndices[,5],
	xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
	main = paste("Mean connectivity"))
	text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

	softPower = 5
	adjacency = adjacency.fromSimilarity(all_M, power = softPower)

	# Turn adjacency into topological overlap
	TOM = TOMsimilarity(adjacency);
	dissTOM = 1-TOM

	# Call the hierarchical clustering function
	geneTree = hclust(as.dist(dissTOM), method = "average")
	# Plot the resulting clustering tree (dendrogram)
	sizeGrWindow(12,9)
	plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
	labels = FALSE, hang = 0.04)

	# We like large modules, so we set the minimum module size relatively high:
	minModuleSize = 10;
	# Module identification using dynamic tree cut:
	dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
	deepSplit = 2, pamRespectsDendro = FALSE,
	minClusterSize = minModuleSize);
	table(dynamicMods)

	# Convert numeric lables into colors
	dynamicColors = labels2colors(dynamicMods)
	table(dynamicColors)
	# Plot the dendrogram and colors underneath
	sizeGrWindow(8,6)
	plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
	dendroLabels = FALSE, hang = 0.03,
	addGuide = TRUE, guideHang = 0.05,
	main = "Gene dendrogram and module colors")

	colorOrder = c("grey", standardColors(50))
	moduleLabels = match(dynamicColors, colorOrder)-1
	modules = data.frame(SPV=rownames(all_M), module=moduleLabels)

	labelled_modules = merge(modules, SPV_2_name, by.x = "SPV", by.y = "SPV", all.x = TRUE)
	write.table(labelled_modules, "WGCNA_modules.tsv", quote = F, sep = "\t", row.names = F, col.names = F)

	#### !
	# Can't do this with similarity
	#### !
	# Not run

	# Calculate eigengenes
	MEList = moduleEigengenes(all_M, colors = dynamicColors)
	MEs = MEList$eigengenes
	# Calculate dissimilarity of module eigengenes
	MEDiss = 1-cor(MEs);
	# Cluster module eigengenes
	METree = hclust(as.dist(MEDiss), method = "average");
	# Plot the result
	sizeGrWindow(7, 6)
	plot(METree, main = "Clustering of module eigengenes",
	xlab = "", sub = "")