Last active
January 3, 2016 05:29
-
-
Save computermacgyver/8416496 to your computer and use it in GitHub Desktop.
Analysis of Twitter mentions/retweets network for #CHI2014 paper "Global Connectivity and Multilinguals in the Twitter Network". #R #igraph
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/R | |
# Main analysis for "Global Connectivity and Multilinguals in the Twitter Network" | |
# paper. See http://www.scotthale.net/pubs/?chi2014 for details. | |
# | |
# Author: Scott A. Hale (http://www.scotthale.net/) | |
# License: GPLv2 | |
# If you use this in support of an academic publication, please cite: | |
# | |
# Hale, S. A. (2014) Global Connectivity and Multilinguals in the Twitter Network. | |
# In Proceedings of the 2014 ACM Annual Conference on Human Factors in Computing Systems, | |
# ACM (Montreal, Canada). | |
# | |
# More details, related code, and the original academic paper using this code | |
# is available at http://www.scotthale.net/pubs/?chi2014 | |
# | |
# Requires density_functions.R available at https://gist.github.com/computermacgyver/8416453 | |
# Package setup and run configuration.
library(igraph)
library(ggplot2)
library(scales)
library(gridExtra)
library(plyr)
library(parallel)
library(foreach)
library(doParallel)
source("density_functions.R") #This file available at https://gist.github.com/computermacgyver/8416453
#if non-interactive and --save flag use the following to save the environment before quitting on an error
#options(error = quote(q("yes")))
# Parallelism: NUM.RUNS independent simulation runs spread across NUM.CORES workers.
NUM.CORES<-5
NUM.RUNS<-100
if (NUM.RUNS<NUM.CORES) {
NUM.CORES<-NUM.RUNS#Only use one core if we are running only once
}
#TODO: Consider average time between tweets -- remove lowest quartile as bots (?)
#TODO: Combine Indonesian and Malay together!
# Load the anonymized mention/retweet graph and reduce it to the largest
# weakly-connected component (decompose.graph returns components sorted by
# size, so [[1]] is the LCC).
g<-read.graph("../../hadoop-twitter/graph_ge4_ge2-20_clean-msin_anon.graphml",format="graphml")
components<-decompose.graph(g,mode="weak", min.vertices = 100)
gLCC<-components[[1]]
rm(g)
# Output device sizes used by png()/svg() calls throughout the script.
pngWidth<-800
pngHeight<-700
svgWidth<-8
svgHeight<-8
summary(V(gLCC)$tweetCount)
# summary(V(gLCC)$tweetCount)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1.000 1.000 2.000 3.615 4.000 343.000
#Filter to require at least one *in* edge
V(gLCC)$inDegree<-degree(gLCC, mode = "in")
summary(V(gLCC)$inDegree)
# Keep users with >=1 in-edge, a positive adjusted tweet count, and >=4 tweets,
# then again take the largest weakly-connected component.
gMin5<-induced.subgraph(gLCC,which(V(gLCC)$inDegree>0 & V(gLCC)$tweetCountAdj>0 & V(gLCC)$tweetCount>=4))
components<-decompose.graph(gMin5,mode="weak", min.vertices = 100)
gMin5LCC<-components[[1]]
#rm(gLCC)
rm(gMin5)
#clusters<-edge.betweenness.community(gMin5LCC)
#sizes(clusters)
summary(gMin5LCC)
# Collapse multi-edges (summing edge attributes) and drop self-loops.
gMin5LCC<-simplify(gMin5LCC,remove.loops=TRUE,remove.multiple=TRUE,edge.attr.comb="sum")
summary(gMin5LCC)
gMin5LCCUndirected<-as.undirected(gMin5LCC, mode = "collapse")
summary(gMin5LCCUndirected)
# How many users tweet in only one language (majLangPercentAdj == 1)?
summary(V(gMin5LCCUndirected)$majLangPercentAdj)
length(V(gMin5LCCUndirected)$majLangPercentAdj)
length(V(gMin5LCCUndirected)$majLangPercentAdj[V(gMin5LCCUndirected)$majLangPercentAdj==1])
length(V(gMin5LCCUndirected)$majLangPercentAdj[V(gMin5LCCUndirected)$majLangPercentAdj!=1])
length(V(gMin5LCCUndirected)$majLangPercentAdj[V(gMin5LCCUndirected)$majLangPercentAdj!=1])/length(V(gMin5LCCUndirected)$majLangPercentAdj)
###############################
## Summary                   ##
###############################
# Export edge list, per-node majority language, and usernames for external
# tools (e.g. the Java label propagation code used in the homophily section).
write.graph(gMin5LCC,"edges.list",format="edgelist")
write(V(gMin5LCC)$majLangAdj,"langclean.txt")
write.csv(V(gMin5LCC)$label,"usernames.txt",row.names=FALSE)#Untested
V(gMin5LCC)$inDegree<-degree(gMin5LCC, mode = "in")
V(gMin5LCC)$outDegree<-degree(gMin5LCC, mode = "out")
V(gMin5LCC)$constraint<-constraint(gMin5LCC)
#V(gMin5LCC)$betweenness<-betweenness(gMin5LCC)
# Label each user: Multilingual if less than 100% of (adjusted) tweets are in
# their majority language, Monolingual otherwise.
V(gMin5LCC)$multilingual[V(gMin5LCC)$majLangPercentAdj!=1]<-"Multilingual"
V(gMin5LCC)$multilingual[V(gMin5LCC)$majLangPercentAdj==1]<-"Monolingual"
#V(gMin5LCCUndirected)$multilingual<-as.factor(V(gMin5LCCUndirected)$multilingual)
table(V(gMin5LCC)$multilingual)
length(V(gMin5LCC)$multilingual)
# Per-node summary statistics used for the boxplot/density figures below.
sumStats<-data.frame(multilingual=V(gMin5LCC)$multilingual,
                     tweetCount=V(gMin5LCC)$tweetCountAdj,
                     inDegree=V(gMin5LCC)$inDegree,
                     outDegree=V(gMin5LCC)$outDegree,
                     majLangCount=V(gMin5LCC)$majLangCountAdj,
                     constraint=V(gMin5LCC)$constraint#,betweenness=V(gMin5LCC)$betweenness
                     )
#TODO: Remove outliers?
# Boxplots (log y) comparing multilinguals vs monolinguals on each metric.
svg("tweetCount.svg",width=4,height=4)
plot <- ggplot(sumStats,aes(x=multilingual,y=tweetCount))
plot <- plot + geom_boxplot() + scale_y_log10("Number of tweets")
plot <- plot + scale_x_discrete("") + theme_bw()
plot
dev.off()
svg("inDegree.svg",width=4,height=4)
plot <- ggplot(sumStats,aes(x=multilingual,y=inDegree))
plot <- plot + geom_boxplot() + scale_y_log10("In-degree")
plot <- plot + scale_x_discrete("") + theme_bw()
plot
dev.off()
svg("outDegree.svg",width=4,height=4)
plot <- ggplot(sumStats,aes(x=multilingual,y=outDegree))
plot <- plot + geom_boxplot() + scale_y_log10("Out-degree")
plot <- plot + scale_x_discrete("") + theme_bw()
plot
dev.off()
svg("majLangCount.svg",width=4,height=4)
plot <- ggplot(sumStats,aes(x=multilingual,y=majLangCount))
plot <- plot + geom_boxplot() + scale_y_log10("Number of tweets in majority language")
plot <- plot + scale_x_discrete("") + theme_bw()
plot
dev.off()
# Quick interactive density check of tweet counts by group.
ggplot(sumStats,aes(x=tweetCount,color=multilingual)) + geom_density() + scale_x_log10()
#TODO: Different linetypes for split variable??? (e.g., dashed for mono, solid for multi)
# Log-scale density plots; density.log() comes from density_functions.R and
# returns (at least) a $plot element.
pTweetCount<-density.log(sumStats,"tweetCount","multilingual",n=2048,adjust=8,title="Number of tweets")
pIndegree<-density.log(sumStats,"inDegree","multilingual",n=2048,adjust=8,title="In-degree/visibility")
pOutdegree<-density.log(sumStats,"outDegree","multilingual",n=2048,adjust=8,title="Out-degree/reach")
pConstraint<-density.log(sumStats,"constraint","multilingual",n=2048,adjust=8,title="Burt's constraint")
#pBetweenness<-density.log(sumStats,"betweenness","multilingual",n=2048,adjust=8,title="Betweenness")
# Extract the legend grob from one plot so the combined figures below can
# share a single legend on their own row.
tmp <- ggplot_gtable(ggplot_build(pTweetCount$plot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
leg <- tmp$grobs[[leg]]
lheight <- sum(leg$height)
svg("introSummary2.svg",width=12,height=5)
# 2x3 layout: three density plots on row 1, shared legend on row 2 (heights
# gives the legend row exactly its own height).
# BUG FIX: the original passed ncol=3 AND ncol=1; a duplicated named argument
# fails with "formal argument 'ncol' matched by multiple actual arguments".
grid.arrange(
  pTweetCount$plot+theme(legend.position="none")+scale_x_log10("Number of tweets",
      breaks = c(1,10,100),
      labels = trans_format("log10", math_format(10^.x))),
  pOutdegree$plot+theme(legend.position="none")+scale_x_log10("Out-degree/reach",
      breaks = c(1,10,100),
      labels = trans_format("log10", math_format(10^.x))),
  pIndegree$plot+theme(legend.position="none")+scale_x_log10("In-degree/visibility",
      breaks = c(1,10,100,1000,10000),
      labels = trans_format("log10", math_format(10^.x))),
  #Row 2
  textGrob(""), leg,
  nrow=2, ncol=3,
  heights=unit.c(unit(1, "npc") - lheight, lheight)
)
dev.off()
svg("introSummary.svg",width=12,height=5)
# Same 2x3 arrangement as introSummary2 but with the plots' default x scales.
# BUG FIX: duplicated named argument removed (original had ncol=3,ncol=1,
# which errors with "formal argument 'ncol' matched by multiple actual arguments").
grid.arrange(
  pTweetCount$plot+theme(legend.position="none"),
  pOutdegree$plot+theme(legend.position="none"),
  pIndegree$plot+theme(legend.position="none"),
  #Row 2
  textGrob(""), leg,
  nrow=2, ncol=3,
  heights=unit.c(unit(1, "npc") - lheight, lheight)
)
dev.off()
#plot <- ggplot(subset(sumStats,tweetCount>=4),aes(x=tweetCount,y=inDegree,group=multilingual,color=multilingual))
#plot <- plot + geom_point() + scale_y_log10("Number of tweets in majority language")
#plot <- plot + scale_x_log10("") + theme_bw()
#plot <- ggplot(subset(sumStats,multilingual=="Multilingual"),aes(x=majLangPercent)) + geom_hist()
# Descriptive statistics plus significance tests (Welch t, Wilcoxon rank-sum,
# Kruskal-Wallis) comparing monolinguals and multilinguals on each metric.
summary(sumStats$tweetCount[sumStats$multilingual=="Monolingual"])
sd(sumStats$tweetCount[sumStats$multilingual=="Monolingual"])
summary(sumStats$tweetCount[sumStats$multilingual=="Multilingual"])
sd(sumStats$tweetCount[sumStats$multilingual=="Multilingual"])
t.test(sumStats$tweetCount[sumStats$multilingual=="Monolingual"],sumStats$tweetCount[sumStats$multilingual=="Multilingual"])
wilcox.test(sumStats$tweetCount~sumStats$multilingual)
kruskal.test(sumStats$tweetCount~sumStats$multilingual)
summary(sumStats$outDegree[sumStats$multilingual=="Monolingual"])
sd(sumStats$outDegree[sumStats$multilingual=="Monolingual"])
summary(sumStats$outDegree[sumStats$multilingual=="Multilingual"])
sd(sumStats$outDegree[sumStats$multilingual=="Multilingual"])
t.test(sumStats$outDegree[sumStats$multilingual=="Monolingual"],sumStats$outDegree[sumStats$multilingual=="Multilingual"])
wilcox.test(sumStats$outDegree~sumStats$multilingual)
kruskal.test(sumStats$outDegree~sumStats$multilingual)
summary(sumStats$inDegree[sumStats$multilingual=="Monolingual"])
sd(sumStats$inDegree[sumStats$multilingual=="Monolingual"])
summary(sumStats$inDegree[sumStats$multilingual=="Multilingual"])
sd(sumStats$inDegree[sumStats$multilingual=="Multilingual"])
t.test(sumStats$inDegree[sumStats$multilingual=="Monolingual"],sumStats$inDegree[sumStats$multilingual=="Multilingual"])
wilcox.test(sumStats$inDegree~sumStats$multilingual)
kruskal.test(sumStats$inDegree~sumStats$multilingual)
summary(sumStats$constraint[sumStats$multilingual=="Monolingual"])
summary(sumStats$constraint[sumStats$multilingual=="Multilingual"])
#summary(sumStats$betweenness[sumStats$multilingual=="Monolingual"])
#summary(sumStats$betweenness[sumStats$multilingual=="Multilingual"])
########################
## Node removal -- This analysis is not included in the published paper. See Node removal simple.
########################
#see games.c
#gRand<-rewire(gMin5LCCUndirected,mode="simple",niter=1000)
#OR
#Copy majLangPercentRank across based on degreeRank
#V(gMin5LCCUndirected)$degreeRank<-rank(V(gMin5LCCUndirected)$degree,ties.method="random")
#V(gRand)$degree<-degree(gRand)
#V(gRand)$degreeRank<-rank(V(gRand)$degree,ties.method="random")
#ranks<-V(gMin5LCCUndirected)$majLangPercentRank[order(V(gMin5LCCUndirected)$degreeRank)]
#V(gRand)$majLangPercentRank<-ranks[order(V(gRand)$degreeRank)]
# Multilingual/Monolingual labels on the undirected graph as well.
V(gMin5LCCUndirected)$multilingual[V(gMin5LCCUndirected)$majLangPercentAdj!=1]<-"Multilingual"
V(gMin5LCCUndirected)$multilingual[V(gMin5LCCUndirected)$majLangPercentAdj==1]<-"Monolingual"
#TODO: Consider comparing to attack algorithm -- e.g. remove most central nodes first (by k-cores?)
# Betweenness centrality does not work as it changes as nodes are removed
# Could consdier comparing to removing highest degree first -- Did this (I think): multilinguals created less components than highest degree first
# Remove increasing fractions of nodes from the undirected LCC and record how
# the network fragments.  For each fraction x in `vals`, nodes are removed in
# four orders: most multilingual first, most monolingual first, random order,
# and most multilingual first on the degree-preserving rewired graph `gRand`.
#
# Depends on globals prepared per run by the foreach loop below:
# gMin5LCCUndirected (with $majLangPercentRank set) and gRand (with
# $majLangPercentRank set).
#
# Args:
#   vals: numeric vector of removal fractions in (0, 1).
# Returns a data.frame with one row per (fraction, strategy): val,
# components (count), lccSize, avgSize (mean size of non-LCC components),
# constraint (mean Burt's constraint), numEdges, numNodes, method.
# NOTE: row assignment via c(...) coerces every column to character;
# callers convert back with as.numeric() (see the dfAvg post-processing).
runNodeRemoval <- function(vals) {
  GRAPHSIZE<-length(V(gMin5LCCUndirected))
  # BUG FIX: four strategies are recorded, not three; the original
  # preallocated length(vals)*3 rows and silently grew the data.frame for
  # the fourth ("random-rewire") pass.
  N <- length(vals)*4
  df <- data.frame(val=rep(NA, N), components=rep(NA, N), lccSize=rep(NA, N), avgSize=rep(NA, N), constraint=rep(NA, N), numEdges=rep(NA,N), numNodes=rep(NA,N), method=rep("", N), stringsAsFactors=FALSE)
  i<-0
  # Multilinguals first: keep only nodes ranked >= cutoff (low rank = most multilingual).
  for (x in vals) {
    print(x)
    y<-x*GRAPHSIZE
    tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentRank>=y)
    components<-clusters(tmp,mode="weak")
    m<-max(components$csize)
    i<-i+1
    df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "multiFirst")
  }
  # Monolinguals first: keep nodes ranked <= (1-x)*GRAPHSIZE.
  for (x in vals) {
    print(x)
    y<-(1-x)*GRAPHSIZE
    tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentRank<=y)
    components<-clusters(tmp,mode="weak")
    m<-max(components$csize)
    i<-i+1
    df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "monoFirst")
  }
  # Random order: permute the ranks and repeat.
  V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRank)
  for (x in vals) {
    print(x)
    y<-x*GRAPHSIZE
    tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$permuted>=y)
    components<-clusters(tmp,mode="weak")
    m<-max(components$csize)
    i<-i+1
    df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "random")
  }
  # Multilinguals first on the rewired null model gRand.
  for (x in vals) {
    print(x)
    y<-x*GRAPHSIZE
    tmp<-induced.subgraph(gRand,V(gRand)$majLangPercentRank>=y)
    components<-clusters(tmp,mode="weak")
    m<-max(components$csize)
    i<-i+1
    df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "random-rewire")
  }
  #High degree first (disabled; kept for reference)
  #for (x in vals) {
  #  print(x)
  #  y<-x*GRAPHSIZE
  #  tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$degreeRank>=y)
  #  components<-clusters(tmp,mode="weak")
  #  m<-max(components$csize)
  #  i<-i+1
  #  df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), "highFirst")
  #}
  return(df)
}
# Run the node-removal simulation NUM.RUNS times in parallel; each run
# re-randomizes rank ties and draws a fresh rewired null-model graph.
if (NUM.CORES>1) {
cl <- makeCluster(NUM.CORES)
registerDoParallel(cl, cores = NUM.CORES)
}
dfAvg <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"),
.combine = rbind) %dopar% {
print(paste0("Run: ",run))
GRAPHSIZE<-length(V(gMin5LCCUndirected))
V(gMin5LCCUndirected)$degree<-degree(gMin5LCCUndirected, mode = "all")
# Ranks broken randomly per run so ties contribute variance across runs.
V(gMin5LCCUndirected)$majLangPercentRank<-rank(V(gMin5LCCUndirected)$majLangPercentAdj,ties.method="random")
# Degree-preserving random graph (Viger-Latapy) as a null model.
gRand<-degree.sequence.game(degree(gMin5LCCUndirected),method="vl")
V(gRand)$majLangPercentRank<-V(gMin5LCCUndirected)$majLangPercentRank
df<-runNodeRemoval(seq(0.01,0.99,0.01))
df$edgesRemoved<-ecount(gMin5LCCUndirected)-as.numeric(df$numEdges)
# Persist each run's raw output in case the combined run fails.
write.csv(df,paste0("output_df_randomOrder_withCounts_",run,".csv"))
return(df)
}
warnings()
if (NUM.CORES>1) {
stopCluster(cl)
}
# Fraction of edges that disappear when multilinguals are removed, and the
# corresponding node fractions (also at a 60% majority-language threshold).
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$multilingual=="Monolingual")
1-ecount(tmp)/ecount(gMin5LCCUndirected)
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentAdj>=.6)
1-ecount(tmp)/ecount(gMin5LCCUndirected)
sum(V(gMin5LCCUndirected)$multilingual=="Multilingual")/vcount(gMin5LCCUndirected)
sum(V(gMin5LCCUndirected)$majLangPercentAdj<.6)/vcount(gMin5LCCUndirected)
#dfZoom<-runNodeRemoval(seq(0.01,0.2,0.01))
#write.csv(dfZoom,"output_dfZoom.csv")
#df<-read.csv("output2.csv")
# runNodeRemoval() fills rows via c(), which coerces all columns to
# character; convert back to numeric before averaging.
df<-dfAvg
df$val<-as.numeric(df$val)
df$components<-as.numeric(df$components)
df$lccSize<-as.numeric(df$lccSize)
df$avgSize<-as.numeric(df$avgSize)
df$constraint<-as.numeric(df$constraint)
df$numEdges<-as.numeric(df$numEdges)
df$numNodes<-as.numeric(df$numNodes)
df$edgesRemoved<-as.numeric(df$edgesRemoved)
# Average all runs per (removal fraction, strategy).
df<-ddply(df,.(val,method),summarize,
          components=mean(components),lccSize=mean(lccSize),
          avgSize=mean(avgSize),constraint=mean(constraint),
          numEdges=mean(numEdges),numNodes=mean(numNodes),edgesRemoved=mean(edgesRemoved))
write.csv(df,"output_df_randomOrder_withCounts_avg100.csv")
# Diagnostic plots: fragmentation metrics vs fraction removed, by strategy.
png("no_components.png",width=pngWidth,height=pngHeight)
plot<-ggplot(df,aes(x=val,y=components,color=method,group=method))
plot<-plot + geom_line() + scale_color_brewer(type="qual") + scale_x_continuous(labels=percent)
plot
dev.off()
png("lccSize.png",width=pngWidth,height=pngHeight)
plot<-ggplot(df,aes(x=val,y=lccSize,color=method,group=method))
plot<-plot + geom_line() + scale_color_brewer(type="qual") + scale_x_continuous(labels=percent)
plot
dev.off()
png("avgSize.png",width=pngWidth,height=pngHeight)
plot<-ggplot(df,aes(x=val,y=avgSize,color=method,group=method))
plot<-plot + geom_line() + scale_color_brewer(type="qual") + scale_x_continuous(labels=percent)
plot
dev.off()
png("constraint.png",width=pngWidth,height=pngHeight)
plot<-ggplot(df,aes(x=val,y=constraint,color=method,group=method))
plot<-plot + geom_line()
plot
dev.off()
########################
## Node removal simple -- This is included
########################
# Single-shot removal: delete ALL multilinguals at once and compare the
# resulting fragmentation against removing the same number of (a) most
# monolingual nodes, (b) random nodes, (c) multilinguals on the rewired null
# model, and (d) highest-degree nodes.
#
# Args:
#   gMin5LCCUndirected: undirected LCC with $majLangPercentAdj,
#     $majLangPercentRank, and $degreeRank vertex attributes set.
#   gRand: degree-preserving rewired graph with $majLangPercentAdj set.
# Returns a data.frame with one row per strategy: components, lccSize,
# avgSize, constraint, numEdges, numNodes, method.
runNodeRemovalSimple <- function(gMin5LCCUndirected,gRand) {
  nodesToRemove<-length(V(gMin5LCCUndirected)[V(gMin5LCCUndirected)$majLangPercentAdj<1])
  GRAPHSIZE<-length(V(gMin5LCCUndirected))
  # BUG FIX: five strategies are recorded; the original preallocated 4 rows
  # and silently grew the data.frame on the fifth ("highDegree") assignment.
  N <- 5
  df <- data.frame(components=rep(NA, N), lccSize=rep(NA, N), avgSize=rep(NA, N), constraint=rep(NA, N), numEdges=rep(NA,N), numNodes=rep(NA,N), method=rep("", N), stringsAsFactors=FALSE)
  i<-0
  print("Multilinguals (all removed)")
  tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentAdj==1)
  components<-clusters(tmp,mode="weak")
  m<-max(components$csize)
  i<-i+1
  df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "multi")
  print("Monolinguals (random subset)")
  # Remove the nodesToRemove MOST monolingual nodes (top of the rank order).
  tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentRank<=(GRAPHSIZE-nodesToRemove))
  components<-clusters(tmp,mode="weak")
  m<-max(components$csize)
  i<-i+1
  df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "mono")
  #Randomly permute majLangPercent and repeat the above
  V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRank)
  print("Random subset")
  tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$permuted>nodesToRemove)
  components<-clusters(tmp,mode="weak")
  m<-max(components$csize)
  i<-i+1
  df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "random")
  print("Multilinguals (all, edge rewiring)")
  tmp<-induced.subgraph(gRand,V(gRand)$majLangPercentAdj==1)
  components<-clusters(tmp,mode="weak")
  m<-max(components$csize)
  i<-i+1
  df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "multiRewire")
  print("Highest degree subset")
  tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$degreeRank>nodesToRemove)
  components<-clusters(tmp,mode="weak")
  m<-max(components$csize)
  i<-i+1
  df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "highDegree")
  return(df)
}
# Repeat the simple node-removal comparison NUM.RUNS times in parallel.
if (NUM.CORES>1) {
cl <- makeCluster(NUM.CORES)
registerDoParallel(cl, cores = NUM.CORES)
}
dfNodeRemoval <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"),
.combine = rbind) %dopar% {
print(paste0("Run: ",run))
GRAPHSIZE<-length(V(gMin5LCCUndirected))
V(gMin5LCCUndirected)$degree<-degree(gMin5LCCUndirected, mode = "all")
# degreeRank: low rank = highest degree (rank is inverted via GRAPHSIZE-).
V(gMin5LCCUndirected)$degreeRank<-GRAPHSIZE-rank(V(gMin5LCCUndirected)$degree,ties.method="random")
V(gMin5LCCUndirected)$majLangPercentRank<-rank(V(gMin5LCCUndirected)$majLangPercentAdj,ties.method="random")
# Degree-preserving random graph (Viger-Latapy) null model for this run.
gRand<-degree.sequence.game(degree(gMin5LCCUndirected),method="vl")
V(gRand)$majLangPercentRank<-V(gMin5LCCUndirected)$majLangPercentRank
V(gRand)$majLangPercentAdj<-V(gMin5LCCUndirected)$majLangPercentAdj
df<-runNodeRemovalSimple(gMin5LCCUndirected,gRand)
df$edgesRemoved<-ecount(gMin5LCCUndirected)-as.numeric(df$numEdges)
df$run<-run
write.csv(df,paste0("output_dfNodeRemovalSimple_",run,".csv"))
return(df)
}
write.csv(dfNodeRemoval,"dfNodeRemovalSimple.csv")
warnings()
if (NUM.CORES>1) {
stopCluster(cl)
}
# Summarize one column of a data frame with mean, SD, a 95% normal-
# approximation interval, and the observed extremes.
#
# Args:
#   df:  a data.frame.
#   var: column name (string) to summarize.
# Returns a one-row data.frame with columns <var>Mean, <var>SD, <var>95Max,
# <var>95Min, <var>Max, <var>Min.  NOTE: mean +/- 1.96*SD is an interval for
# the observations, not a confidence interval for the mean (no sqrt(n)).
conf95<-function(df,var) {
  m<-mean(df[,var])
  s<-sd(df[,var])
  tmp<-data.frame(
    m,
    s,
    m+(1.96*s),
    m-(1.96*s),
    max(df[,var]),
    min(df[,var])
  )
  # BUG FIX: the original wrapped this in eval(), which is a pointless no-op
  # around a replacement assignment; plain `names<-` is all that is needed.
  names(tmp)<-c(
    paste0(var,"Mean"),paste0(var,"SD"),
    paste0(var,"95Max"),paste0(var,"95Min"),
    paste0(var,"Max"),paste0(var,"Min")
  )
  return(tmp)
}
# Combine conf95() summaries of the three fragmentation metrics into a single
# one-row data.frame (used as the ddply aggregation function per method).
#
# Args:
#   df: data.frame with numeric columns lccSize, avgSize, and components
#       (one row per simulation run).
# Returns a one-row data.frame: cbind of conf95() output for lccSize,
# avgSize, and components.
meanSD<-function(df) {
  # (The original also built an unused data.frame of means/SDs here; that
  # dead code has been removed.)
  lccSize<-conf95(df,"lccSize")
  avgSize<-conf95(df,"avgSize")
  components<-conf95(df,"components")
  return(cbind(lccSize,avgSize,components))
}
#dfNodeRemoval<-read.csv("dfNodeRemovalSimple.csv")
# Aggregate the simple-removal runs per strategy and build the published
# boxplot figure (nodeRemovalSimple.svg).
dfNodeRemovalAvg<-ddply(dfNodeRemoval,.(method),meanSD)
dfNodeRemovalAvg
# highDegree is excluded from the published figure.
dfNodeRemovalAvg<-subset(dfNodeRemovalAvg,method!="highDegree")
#bar chart
ggplot(dfNodeRemovalAvg,aes(x=method,y=componentsMean)) + geom_bar(stat="identity",alpha=0.75) + geom_errorbar(aes(ymax=components95Max,ymin=components95Min)) + theme_bw()
ggplot(dfNodeRemovalAvg,aes(x=method,y=lccSizeMean)) + geom_bar(stat="identity",alpha=0.75) + geom_errorbar(aes(ymax=lccSize95Max,ymin=lccSize95Min)) + theme_bw()
ggplot(dfNodeRemovalAvg,aes(x=method,y=avgSizeMean)) + geom_bar(stat="identity",alpha=0.75) + geom_errorbar(aes(ymax=avgSize95Max,ymin=avgSize95Min)) + theme_bw()
#boxplots
dfNodeRemoval<-subset(dfNodeRemoval,method!="highDegree")
# Human-readable axis labels (leading \n staggers adjacent labels).
dfNodeRemoval$label<-factor(dfNodeRemoval$method,levels=c("multi","mono","random","multiRewire"),labels=c("Multilinguals","\nMonolinguals","Random","\nMultilinguals\n(edges rewired)"))
dfNodeRemovalAvg$label<-factor(dfNodeRemovalAvg$method,levels=c("multi","mono","random","multiRewire"),labels=c("Multilinguals","\nMonolinguals","Random","\nMultilinguals\n(edges rewired)"))
#xscale<-scale_x_discrete("",
#	breaks=c("multi","mono","multiRewire","random"),
#	labels=c("Multilinguals","\nMonolinguals","\nMultilinguals\n(Edges rewired)","Random")
#)
xscale<-scale_x_discrete("")
# Boxplots per strategy with the mean overlaid as a '+' (shape=3).
cPlot<-ggplot(dfNodeRemoval,aes(x=label,y=components))+geom_boxplot()+theme_bw()
cPlot<-cPlot+xscale+scale_y_continuous("Number of components",labels=comma)
cPlot<-cPlot+geom_point(data=dfNodeRemovalAvg,aes(x=label,y=componentsMean),shape=3)#3=+, 4=*
lccPlot<-ggplot(dfNodeRemoval,aes(x=label,y=lccSize))+geom_boxplot()+theme_bw()
lccPlot<-lccPlot+xscale+scale_y_continuous("Size of LCC",labels=comma)
lccPlot<-lccPlot+geom_point(data=dfNodeRemovalAvg,aes(x=label,y=lccSizeMean),shape=3)
avgPlot<-ggplot(dfNodeRemoval,aes(x=label,y=avgSize))+geom_boxplot()+theme_bw()
avgPlot<-avgPlot+xscale+scale_y_continuous("Average component size (not including LCC)",labels=comma)
avgPlot<-avgPlot+geom_point(data=dfNodeRemovalAvg,aes(x=label,y=avgSizeMean),shape=3)
svg("nodeRemovalSimple.svg",width=12,height=5)
grid.arrange(lccPlot,cPlot,avgPlot,nrow=1,ncol=3)
dev.off()
# Additional comparison: fraction of remaining users left OUTSIDE the LCC
# after each removal strategy, versus removing all speakers of one language.
# NOTE(review): dfRemoveLangs is built further down this script; when running
# interactively, execute that section first.
nodesToRemove<-length(V(gMin5LCCUndirected)[V(gMin5LCCUndirected)$majLangPercentAdj<1])
dfNodeRemovalAvg$lccPercent<-1-(dfNodeRemovalAvg$lccSizeMean/(length(V(gMin5LCC))-nodesToRemove))
dfNodeRemovalAvg[,c("method","lccPercent")]
# BUG FIX: the original indexed with dfRemoveLangs=="en", which compares
# every cell of the data frame; index the lang column instead.
dfRemoveLangs[dfRemoveLangs$lang=="en","lccFrac"]
###############################
## Language analysis - remove nodes from one language at a time
##############################
#\label{tbl:topLangs}
#TOP Langs: Table tbl:topLangs
sort(table(V(gMin5LCCUndirected)$majLangAdj))
# The eight largest languages analyzed individually below.
langs<-c("en","ja","pt","ms","es","nl","ko","th")
#Avg. tweets per lang
dfTmp<-data.frame(majLangAdj=V(gMin5LCCUndirected)$majLangAdj,tweetCount=V(gMin5LCCUndirected)$tweetCount)
dfTmp<-ddply(dfTmp,.(majLangAdj),summarize,meanTweets=round(mean(tweetCount),2),tweetSD=round(sd(tweetCount),2),tweetMin=min(tweetCount),tweetMax=max(tweetCount),tweetMed=median(tweetCount),sumTweets=sum(tweetCount),numUsers=length(tweetCount))
dfTmp[order(dfTmp$numUsers,decreasing=TRUE),c("majLangAdj","numUsers","meanTweets","tweetSD","tweetMin","tweetMax","tweetMed")]
print(dfTmp[order(dfTmp$numUsers,decreasing=TRUE),c("majLangAdj","numUsers","meanTweets","tweetSD")],row.names=FALSE)
# N rows are preallocated for each language plus one "random" baseline pass.
GRAPHSIZE<-length(V(gMin5LCCUndirected))
vals<-seq(0.01,0.3,0.01)
N <- length(vals)*(length(langs)+1)
vals<-seq(0.01,0.3,0.01) | |
N <- length(vals)*(length(langs)+1) | |
if (NUM.CORES>1) { | |
cl <- makeCluster(NUM.CORES) | |
registerDoParallel(cl, cores = NUM.CORES) | |
} | |
dfLang100 <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"), | |
.combine = rbind) %dopar% { | |
dfLang <- data.frame(val=rep(NA, N), components=rep(NA, N), lccSize=rep(NA, N), avgSize=rep(NA, N), lang=rep("", N),stringsAsFactors=FALSE) | |
i<-0 | |
print(paste0("Run: ",run)) | |
V(gMin5LCCUndirected)$majLangPercentRank<-rank(V(gMin5LCCUndirected)$majLangPercentAdj,ties.method="random") | |
V(gMin5LCCUndirected)$majLangPercentRankRev<-GRAPHSIZE-V(gMin5LCCUndirected)$majLangPercentRank | |
for (lang in langs) { | |
ranks<-sort(V(gMin5LCCUndirected)$majLangPercentRankRev[V(gMin5LCCUndirected)$majLangAdj==lang]) | |
for (x in vals) { | |
print(paste(lang,x)) | |
y<-x*GRAPHSIZE | |
y<-ranks[min(y,length(ranks))] | |
tmp<-induced.subgraph(gMin5LCCUndirected, | |
V(gMin5LCCUndirected)$majLangAdj!=lang | V(gMin5LCCUndirected)$majLangPercentRankRev>=y) | |
#print(summary(tmp)) | |
components<-clusters(tmp,mode="weak") | |
#print(components$no) | |
m<-max(components$csize) | |
i<-i+1 | |
dfLang[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), lang) | |
} | |
} | |
#Randomly permute majLangPercentRev and repeat the above | |
V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRankRev) | |
for (x in vals) { | |
print(paste("random",x)) | |
y<-x*GRAPHSIZE | |
tmp<-induced.subgraph(gMin5LCCUndirected, | |
V(gMin5LCCUndirected)$permuted>=y) | |
#print(summary(tmp)) | |
components<-clusters(tmp,mode="weak") | |
#print(components$no) | |
m<-max(components$csize) | |
i<-i+1 | |
dfLang[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), "random") | |
} | |
write.csv(dfLang,paste0("dfLang_run_",run,".csv")) | |
return(dfLang) | |
} | |
# Rows were filled via c(), coercing columns to character; convert back to
# numeric, then average the runs per (fraction, language).
dfLang<-dfLang100
dfLang$val<-as.numeric(dfLang$val)
dfLang$components<-as.numeric(dfLang$components)
dfLang$lccSize<-as.numeric(dfLang$lccSize)
dfLang$avgSize<-as.numeric(dfLang$avgSize)
write.csv(dfLang,"dfLangAllRuns.csv")
dfLang<-ddply(dfLang,.(val,lang),summarize,
              components=mean(components),lccSize=mean(lccSize),
              avgSize=mean(avgSize))
write.csv(dfLang,"dfLang_avg100.csv")
warnings()
if (NUM.CORES>1) {
stopCluster(cl)
}
#scale_color_brewer(type="qual")
# NOTE(review): subsetting on lang %in% langs drops the "random" baseline
# rows BEFORE the relabeling below, so "Random" never appears -- confirm
# whether that is intended.
dfLang<-subset(dfLang,lang %in% langs)
dfLang$lang[dfLang$lang=="random"]<-"Random"
# Number-of-components curve per language; line labels are drawn at val=0.3.
svg("lang_no_components.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfLang,aes(x=val,y=components,color=lang,group=lang))
plot<-plot + geom_point() + geom_line(size=1) + scale_color_discrete("Language")#,type="qual")
plot<-plot + scale_x_continuous('% of users removed',labels=percent)
plot<-plot + scale_y_continuous("Number of components",labels=comma)
plot<-plot + theme_bw() +
  theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
        axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
        axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot<-plot + geom_text(aes(x=.31,y=components,color=lang,label=lang),data=subset(dfLang,val==0.3))
plot
dev.off()
# Final (val==0.3) component counts, sorted.
dfEnding<-subset(dfLang,val==.3)
dfEnding[order(dfEnding$components), ]
# LCC-size curve per language as users of that language are removed.
svg("lang_lccSize.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfLang,aes(x=val,y=lccSize,color=lang,group=lang))
plot<-plot + geom_point() + geom_line(size=1) + scale_color_discrete("Language")#,type="qual")
plot<-plot + scale_x_continuous('% of users removed',labels=percent)
plot<-plot + scale_y_continuous("Size of largest connected component",labels=comma)
plot<-plot + theme_bw() +
  theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
        axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
        axis.title.y=element_text(size=18),axis.text.y=element_text(size=16))
plot
dev.off()
#TODO: Look at direct.label
table(dfLang$lang[dfLang$val==.3],dfLang$components[dfLang$val==.3])
#Ending num. components for each language (val==.3)
#en 48,190
#es 1,923
#in 6,864
#ja 385
#ko 184
#ms 8,626
#nl 965
#pt 2,211
#random 67,206
# Scatter plots of each metric against majority-language percentage.
V(gMin5LCCUndirected)$constraint<-constraint(gMin5LCCUndirected)
dfGraph<-data.frame(
  majLangPercentAdj=V(gMin5LCCUndirected)$majLangPercentAdj,
  tweetCount=V(gMin5LCCUndirected)$tweetCountAdj,
  inDegree=V(gMin5LCCUndirected)$inDegree,
  constraint=V(gMin5LCCUndirected)$constraint
)
#More graphs
png("tweetCount_majLangPercent.png",width=pngWidth,height=pngHeight)
plot <- ggplot(dfGraph,aes(x=majLangPercentAdj,y=tweetCount)) + geom_point()
plot
dev.off()
png("inDegree_majLangPercent.png",width=pngWidth,height=pngHeight)
plot <- ggplot(dfGraph,aes(x=majLangPercentAdj,y=inDegree)) + geom_point()
plot
dev.off()
png("constraint_majLangPercent.png",width=pngWidth,height=pngHeight)
plot <- ggplot(dfGraph,aes(x=majLangPercentAdj,y=constraint)) + geom_point()
plot
dev.off()
# Identify the extreme in-degree outliers by username.
V(gMin5LCCUndirected)$label[V(gMin5LCCUndirected)$inDegree>40000]#justinbieber
V(gMin5LCCUndirected)$label[V(gMin5LCCUndirected)$inDegree>10000]#SlGNO
# Remove ALL speakers of each (>1000-user) language in turn and compute the
# fraction of remaining nodes left outside the LCC.
langs<-as.data.frame(table(V(gMin5LCCUndirected)$majLangAdj))
langs<-subset(langs,langs$Freq>1000)$Var1
N<-length(langs)
dfRemoveLangs<-data.frame(lang=rep("",N),components=rep(NA,N),lccSize=rep(NA,N),
                          numNodes=rep(NA,N),num2=rep(NA,N),stringsAsFactors=FALSE)
i<-0
for (lang in langs) {
  tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangAdj!=lang)
  components<-clusters(tmp,mode="weak")
  m<-max(components$csize)
  i<-i+1
  # num2 = total nodes outside the LCC (sanity-checked below).
  dfRemoveLangs[i, ] <- list(lang, components$no, m, length(V(tmp)), sum(components$csize[components$csize!=m]))
}
# Sanity check: LCC size plus non-LCC nodes should equal the node count.
dfRemoveLangs$numNodes==dfRemoveLangs$num2+dfRemoveLangs$lccSize
dfRemoveLangs$lccFrac<-(dfRemoveLangs$numNodes-dfRemoveLangs$lccSize)/dfRemoveLangs$numNodes
svg("lang_removed.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfRemoveLangs,aes(x=lang,y=lccFrac))
plot<-plot + geom_bar(stat="identity")
plot<-plot + scale_y_continuous('% of nodes not in largest-connected component',labels=percent)
plot<-plot + scale_x_discrete("Language removed")
plot<-plot + theme_bw() +
  theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
        axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
        axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
# Malay and Indonesian removed together (cf. the combine-ms-in TODO above).
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangAdj!="ms" & V(gMin5LCCUndirected)$majLangAdj!="in")
components<-clusters(tmp,mode="weak")
m<-max(components$csize)
(length(V(tmp))-m)/length(V(tmp))
###################################################
## Q1: Is language homophilous?                  ##
###################################################
#take largest *strongly* connected component?
#labelPropComm<-label.propagation.community(gMin5LCCUndirected)
#write.csv(labelPropComm$membership,"labelPropCommMembership.csv")
# Community memberships produced by the external Java label propagation code.
# NOTE(review): the closing brace of this if block is not visible in this
# chunk; it presumably closes after the analysis below.
commsFile<-"iter72memberships_smart.txt" #This file comes from the label propagation code in Java
if (!is.na(commsFile)){
memberships<-read.csv(commsFile,header=FALSE,sep=" ")
#Begin Untested code
tmp<-read.csv("usernames.txt",header=FALSE)
sum(tmp!=V(gMin5LCCUndirected)$label)#This should be 0, else the lists are malaligned!
#End untested code
modularity(gMin5LCCUndirected,memberships$V2)
#[1] 0.7471879
#in-ms: 0.8054294
# Per-community summary: size, number of languages, and majority language.
N <- max(memberships$V2)
dfComms <- data.frame(num=seq(1, N), size=rep(0, N), numLangs=rep(0, N), majLangCount=rep(0,N), majLang=rep("", N),stringsAsFactors=FALSE)
for (comm in seq(1,N)) {
size<-length(memberships$V2[memberships$V2==comm])
langs<-V(gMin5LCCUndirected)$majLangAdj[memberships$V2==comm]
numLangs<-length(table(langs))
# Mode (most frequent language) of this community.
majLang <- unique(langs)
majLang <- majLang[which.max(tabulate(match(langs,majLang)))]
majLangCount<-sum(langs==majLang)
dfComms[comm, ] <- list(comm, size, numLangs, majLangCount, majLang)
}
write.csv(dfComms,"dfCommunities.csv")
#dfComms<-read.csv("community_info_iter36.csv")
#dfComms$label<-dfComms$label+1
# Fraction of each community's users speaking its dominant language.
# NOTE: this column must be created *before* the head() call below that
# selects it; previously it was only added further down, so the head()
# call failed with "undefined columns selected".
dfComms$majLangPercent<-dfComms$majLangCount/dfComms$size
# How many communities are monolingual, and what share of users they hold.
sum(dfComms$numLangs==1)
sum(dfComms$numLangs==1)/length(dfComms$numLangs) #A large number of components have one language only
sum(dfComms$size[dfComms$numLangs==1])/sum(dfComms$size) #But these components represent a small number of users
#Five large components have 61% of users, #\label{tbl:topCommunities}
sum(dfComms$size[dfComms$size>10^4])/sum(dfComms$size)
dfComms[dfComms$size>10^4, ]
# Table of the largest communities with majority-language percentage.
top<-dfComms[dfComms$size>10^4, ]
top$majLangPercent<-round((top$majLangCount/top$size)*100,1)
top[order(top$size,decreasing=TRUE), c("majLang","majLangPercent","numLangs","size")]
head(dfComms[order(dfComms$size,decreasing=TRUE), c("majLang","majLangPercent","numLangs","size")],n=25)
sort(table(dfComms$majLang))
#Is 48 the total number of langs in the network?
length(unique(V(gMin5LCCUndirected)$majLangAdj))
summary(dfComms)
with(dfComms,plot(log(size),majLangPercent))
max(dfComms$size)
# Community ids live in the `num` column (the `label` column only exists
# in the CSV variant loaded above); `dfComms$label` here returned NULL.
dfComms$num[dfComms$size==max(dfComms$size)]
#label 16 is quite large
# Scatter of community size vs. dominant-language share, colored by language.
svg("communities_size_percent.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=size,y=majLangPercent,color=majLang))
plot<-plot+geom_point()
plot<- plot + scale_x_log10("Cluster size",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified in dominant language",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
# Same plot, but with language codes as text labels instead of points.
svg("communities_size_percent_labels.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=size,y=majLangPercent,label=majLang))
plot<-plot+geom_text()
plot<- plot + scale_x_log10("Cluster size",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified in dominant language",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
# Hybrid: points for all communities, text labels only for the large ones
# (label nudged up by 0.025 so it sits above its point).
svg("communities_size_percent_labels_partial.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=size,y=majLangPercent,color=majLang))
plot<-plot+geom_point()
plot<-plot+geom_text(data=subset(dfComms,size>10^4),aes(x=size,y=majLangPercent+0.025,label=majLang))
plot<- plot + scale_x_log10("Cluster size",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified in dominant language",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
dfComms[dfComms$size>10^4, ]
#hist(log(dfComms$size))
# Histogram of community sizes on a log10 x-axis (binwidth is in log units).
svg("community_size.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=size))
plot<-plot+geom_histogram(binwidth=0.15)
plot<- plot + scale_x_log10("Cluster size",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Frequency",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
# Histogram of languages-per-community.
svg("community_numLangs_hist.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=numLangs))
plot<-plot+geom_histogram(binwidth=1)
plot<- plot + scale_x_continuous("Number of languages per cluster")
#plot<-plot + scale_y_continuous("Percent of users in communities with this number of languages or fewer (cdf)",labels=percent)
plot<-plot + scale_y_continuous("Frequency",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
# Zoomed version restricted to communities with <= 5 languages.
svg("community_numLangs_hist_zoom.svg",width=svgWidth,height=svgHeight)
zplot<-ggplot(subset(dfComms,numLangs<=5),aes(x=as.factor(numLangs)))
zplot<-zplot+geom_histogram(binwidth=1)
zplot<-zplot + scale_x_discrete("Number of languages per cluster")
zplot<-zplot + scale_y_continuous("Frequency",labels=comma)
zplot<-zplot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
zplot
dev.off()
#dfDensity<-densityDataframe(dfComms,"numLangs",NA,n=2048,adjust=8)
#dplot <- ggplot(dfDensity,aes(x=x10,y=y)) + geom_path()
# Inset: frequency of each numLangs value on a log10 y-axis.
dfTmp<-as.data.frame(table(dfComms$numLangs))
insert<-ggplot(dfTmp,aes(x=as.numeric(Var1),y=Freq))+geom_point()+
scale_y_log10("",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))+
scale_x_continuous("")+ theme_bw()
# Overlay the inset on the zoomed histogram via a grid viewport
# (viewport() comes from the grid package, loaded by gridExtra).
svg("community_numLangs_hist_insert.svg",width=svgWidth,height=svgHeight)
print(zplot)
#print(plot+scale_x_continuous("")+scale_y_continuous("",labels=comma),
print(insert,
vp=viewport(width=0.6,height=0.6,x=1,y=1,just=c("right","top")))
dev.off()
N <- max(memberships$V2)
# Null model: repeat the per-community language summary NUM.RUNS times with
# the language labels randomly permuted across vertices, run in parallel.
if (NUM.CORES>1) {
cl <- makeCluster(NUM.CORES)
registerDoParallel(cl, cores = NUM.CORES)
}
# Each run returns one dfCommsShuffled; rbind stacks all runs together.
dfCommsShuffled100 <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"),
.combine = rbind) %dopar% {
dfCommsShuffled <- data.frame(num=seq(1, N), size=rep(0, N), numLangs=rep(0, N), majLangCount=rep(0,N), majLang=rep("", N),stringsAsFactors=FALSE)
print(paste0("Run: ",run))
# Permute language labels; community membership stays fixed.
V(gMin5LCCUndirected)$majLangAdjShuffled<-sample(V(gMin5LCCUndirected)$majLangAdj)#Random permutation
for (comm in seq(1,N)) {
size<-length(memberships$V2[memberships$V2==comm])
langs<-V(gMin5LCCUndirected)$majLangAdjShuffled[memberships$V2==comm]
numLangs<-length(table(langs))
# Mode of the shuffled labels within this community.
majLang <- unique(langs)
majLang <- majLang[which.max(tabulate(match(langs,majLang)))]
majLangCount<-sum(langs==majLang)
dfCommsShuffled[comm, ] <- list(comm, size, numLangs, majLangCount, majLang)
}
# Persist each run so partial results survive a crash.
write.csv(dfCommsShuffled,paste0("dfCommsShuffled_run_",run,".csv"))
return(dfCommsShuffled)
}
#Collapse dfComms
write.csv(dfCommsShuffled100,"dfCommsShuffled.csv")
# Average the runs per community (mean and sd of each statistic).
dfCommsShuffled<-ddply(dfCommsShuffled100,.(num),summarize,
size=mean(size),sizeSD=sd(size),
numLangs=mean(numLangs),numLangsSD=sd(numLangs),
majLangCount=mean(majLangCount),majLangCountSD=sd(majLangCount),
majLang="skip")
dfCommsShuffled$majLangPercent<-dfCommsShuffled$majLangCount/dfCommsShuffled$size
warnings()
if (NUM.CORES>1) {
stopCluster(cl)
}
# Histogram of languages-per-community under the shuffled null model.
svg("community_numLangs_hist_shuffled.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfCommsShuffled,aes(x=numLangs))
plot<-plot+geom_histogram(binwidth=1)
plot<- plot + scale_x_continuous("Number of languages per community (when language labels are randomly shuffled)")
plot<-plot + scale_y_continuous("Frequency",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
# Compare observed vs. shuffled distributions.
summary(dfComms$numLangs)
summary(dfCommsShuffled$numLangs)
length(dfComms$numLangs[dfComms$numLangs==1])
length(dfCommsShuffled$numLangs[dfCommsShuffled$numLangs==1])
# Stack the two datasets with a group label for a combined density plot
# (densityDataframe/densitySummary/densityPlot come from density_functions.R).
dfComms$shuffled<-"Observed data"
dfCommsShuffled$shuffled<-"Language labels shuffled"
dfTmp<-rbind(dfComms[,c("size","numLangs","majLangCount","shuffled")],dfCommsShuffled[,c("size","numLangs","majLangCount","shuffled")])
dfDensity<-densityDataframe(dfTmp,"numLangs","shuffled",n=2048,adjust=6)
dfDensitySum<-densitySummary(dfTmp,"numLangs","shuffled")
pNumLangs<-densityPlot(dfDensity,dfDensitySum,"Number of languages per community")
svg("community_numLangs_density.svg",width=svgWidth,height=svgHeight)
pNumLangs
dev.off()
#difference of means t-test
t.test(x=dfComms$numLangs,y=dfCommsShuffled$numLangs,alternative="two.sided")
# p-value < 2.2e-16
}
#spinglassComm<-spinglass.community(gMin5LCCUndirected, weights=NA)
#write.csv(spinglassComm$membership,"spinglassCommMembership.csv")
# Modularity when the partition is simply each user's majority language.
modularity(gMin5LCCUndirected,as.factor(V(gMin5LCCUndirected)$majLangAdj))
#[1] 0.6489853 ge2?
#[1] 0.6291782 ge4_ge2-20
#[1] 0.6671592 ge4_ge2-20_inms
###################################################
## Q3: Lang outwardness ##
###################################################
#TODO: Include all languages????
sort(table(V(gMin5LCCUndirected)$majLangAdj))
#langList<-sort(c("in","es","ms","pt","ja","en","ru","de","tr","it","fil","fr","ar","th","ko","nl"))
#gLangs<-induced.subgraph(gMin5LCC,V(gMin5LCC)$majLangAdj %in% langList)
langList<-sort(unique(V(gMin5LCCUndirected)$majLangAdj))#Must be in same order as langSizes!!!
# Use the *directed* LCC graph for the outwardness analysis.
gLangs<-gMin5LCC
sort(table(V(gLangs)$majLangAdj))
#langs <- unique(V(gLangs)$majLangAdj)
# table() returns counts sorted by name, matching the sorted langList above.
langSizes<-table(V(gLangs)$majLangAdj)
totalNodes<-length(V(gLangs)) #Check if this needs $something?
totalEdges<-length(E(gLangs)) #Check if this needs $weight?
#Check distribution of majLangPercentAdj for different languages
library(plyr)
# Per-language summary of how concentrated users' tweets are in their
# majority language; percent==1 means monolingual, so multilingualCount
# counts users with at least some tweets in another language.
dfMultilingual<-data.frame(lang=V(gMin5LCCUndirected)$majLangAdj,percent=V(gMin5LCCUndirected)$majLangPercentAdj)
dfMultilingual$lang<-as.factor(dfMultilingual$lang)
dfMultiCollapse<-ddply(dfMultilingual,.(lang),function(df) {c(length(df$percent),min(df$percent),max(df$percent),mean(df$percent),median(df$percent),length(df$percent[df$percent!=1]))})
names(dfMultiCollapse)<-c("lang","nodeCount","min","max","mean","median","multilingualCount")
dfMultiCollapse$multiPercent<-dfMultiCollapse$multilingualCount/dfMultiCollapse$nodeCount
# Languages with >1000 users; reused later to subset the collapsed graph.
largeLangs<-as.character(dfMultiCollapse$lang[dfMultiCollapse$nodeCount>1000])
plot(log(dfMultiCollapse$nodeCount),dfMultiCollapse$multiPercent)
dfMultiCollapse[order(dfMultiCollapse$nodeCount),c("lang","nodeCount","multiPercent")]
dfMultiCollapseSub<-subset(dfMultiCollapse,nodeCount>1000)
with(dfMultiCollapseSub,plot(log(nodeCount),multiPercent))
# Language size vs. share of multilingual users, large languages only.
svg("multilingualism_ge1000.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(subset(dfMultiCollapse,nodeCount>1000),aes(x=nodeCount,y=multiPercent))
plot<-plot+geom_point()
plot<- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
# Same plot over all languages.
svg("multilingualism.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfMultiCollapse,aes(x=nodeCount,y=multiPercent))
plot<-plot+geom_point()
plot<- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
# Language codes as labels instead of points.
svg("multilingualism-labels.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfMultiCollapse,aes(x=nodeCount,y=multiPercent,label=lang))
plot<-plot+geom_text()#Use size= here to control size
plot<- plot + scale_x_log10("Language size (number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
# Points with labels nudged up by 0.02 above each point.
svg("multilingualism-labels_and_point.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfMultiCollapse,aes(x=nodeCount,y=multiPercent,label=lang)) + geom_point()
plot<-plot+geom_text(aes(y=multiPercent+0.02))#Use size= here to control size
plot<- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot#+stat_smooth(method="loess")
#method="lm","gam"/library(mgcv) (http://www.inside-r.org/r-doc/mgcv/gam)
dev.off()
#Correlation between size/mutlilingualism?
cor(dfMultiCollapse$nodeCount,dfMultiCollapse$multiPercent)
with(dfMultiCollapse,cor(log(nodeCount),multiPercent))
with(subset(dfMultiCollapse,nodeCount<=1000),cor(log(nodeCount),multiPercent))
with(subset(dfMultiCollapse,nodeCount<=10^5),cor(log(nodeCount),multiPercent))
coef(lm(log(nodeCount) ~ multiPercent, data = dfMultiCollapse))
# One-sided tests: is multilingualism *negatively* correlated with size?
cor.test(dfMultiCollapse$nodeCount,dfMultiCollapse$multiPercent,alternative="less",conf.level=0.95)
cor.test(log(dfMultiCollapse$nodeCount),dfMultiCollapse$multiPercent,alternative="less",conf.level=0.95)
# method = c("pearson", "kendall", "spearman")
#dfMultiCollapseSub[order(dfMultiCollapseSub$multiPercent),c("lang","nodeCount","multiPercent")]
rm(dfMultiCollapseSub)
dfMultiCollapse[order(dfMultiCollapse$multiPercent),c("lang","nodeCount","multiPercent")]
dfMultiCollapse[order(dfMultiCollapse$nodeCount),c("lang","nodeCount","multiPercent")]
N<-length(langList)
# Per-language edge accounting: observed vs. expected in-language edges.
dfOut <- data.frame(lang=rep("", N), nodeCount=rep(0, N), edgeCount=rep(0, N),
inLangEdges=rep(0, N),expInLangEdges=rep(0, N),
stringsAsFactors=FALSE)
i<-0
for (lang in langList) {
#langEdges<-length(
# E(gLangs)[E(gLangs)$source %in%
# V(gLangs)[V(gLangs)$majLangAdj==lang]
# ])
# Total out-edges originating from users of this language.
langEdges<-sum(degree(gLangs,V(gLangs)$majLangAdj==lang,mode="out"))
# Edges with both endpoints inside this language.
inGroupEdges<-
length(E(induced.subgraph(gLangs,V(gLangs)$majLangAdj==lang)))
#bwGroupEdges<-langEdges-inGroupEdges
# Expected in-language edges if targets were chosen proportionally to
# language size (see TODO below about subtracting the source language).
expInGroupEdges<-(as.numeric(langSizes[lang])/totalNodes)*langEdges
i<-i+1
dfOut[i, ] <- list(lang,as.numeric(langSizes[lang]),langEdges,inGroupEdges,expInGroupEdges)
}
#dfOut$nodeCount<-as.numeric(dfOut$nodeCount)
#dfOut$edgeCount<-as.numeric(dfOut$edgeCount)
#dfOut$inLangEdges<-as.numeric(dfOut$inLangEdges)
#dfOut$expInLangEdges<-as.numeric(dfOut$expInLangEdges)
# Derived inwardness measures.
dfOut$diff<-dfOut$inLangEdges-dfOut$expInLangEdges
dfOut$fracIn<-dfOut$inLangEdges/dfOut$edgeCount
dfOut$expFrac<-dfOut$expInLangEdges/dfOut$edgeCount
dfOut$nodeFrac<-dfOut$nodeCount/totalNodes
dfOut$pError<-dfOut$diff/dfOut$expInLangEdges
dfOut$diffPercent<-abs(dfOut$diff)/((dfOut$inLangEdges+dfOut$expInLangEdges)/2)
dfOut$logFracIn<-log(dfOut$inLangEdges)/log(dfOut$edgeCount)
dfOut$zscore<-(dfOut$inLangEdges-mean(dfOut$inLangEdges))/sd(dfOut$inLangEdges)
write.csv(dfOut,"dfOut_directed.csv")
with(subset(dfOut,fracIn>0.8),plot(log(nodeCount),fracIn))
with(subset(dfOut,fracIn>0.8),plot(log(nodeCount),fracIn)) | |
source("../tweet-langs-inms.R") | |
dfTweetLangs<-tweetLangAll() | |
dfTweetLangs<-subset(dfTweetLangs,lang %in% langList) | |
totalTweets<-sum(dfTweetLangs$count) | |
dfTweetLangs$percent<-dfTweetLangs$count/totalTweets | |
dfOut$tweetCount<-NA | |
dfOut$tweetPercent<-NA | |
for (lang in langList) { | |
dfOut$tweetCount[dfOut$lang==lang]<-dfTweetLangs$count[dfTweetLangs$lang==lang] | |
dfOut$tweetPercent[dfOut$lang==lang]<-dfTweetLangs$percent[dfTweetLangs$lang==lang] | |
} | |
dfOut$tweetExpEdges<-dfOut$tweetPercent*dfOut$edgeCount | |
dfOut$tweetdiff<-dfOut$inLangEdges-dfOut$tweetExpEdges | |
#Table \label{tbl:inwardness} | |
dfSum<-subset(dfOut,nodeCount>1000)[ , c("lang","nodeFrac","fracIn") ] | |
dfSum$nodeFrac<-round(dfSum$nodeFrac*100,2) | |
dfSum$fracIn<-round(dfSum$fracIn*100,2) | |
dfSum[order(dfSum$fracIn, decreasing=TRUE), ] | |
ggplot(dfSum,aes(label=lang,x=nodeFrac,y=fracIn))+geom_text() | |
#ident <- function(x) {(x/totalNodes)*totalEdges}
#
# Exploratory plot: language size vs. in-language edge fraction.
plot <- ggplot(dfOut,aes(x=nodeCount,y=fracIn)) + geom_point()
#plot <- plot + stat_function(fun = ident, linetype="dashed")
plot <- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot
#ident <- function(x) {x}
# Language size vs. absolute difference from expectation, log-log with a
# linear fit.
plot <- ggplot(dfOut,aes(x=nodeCount,y=diff)) + geom_point()
#plot <- plot + stat_function(fun = ident, linetype="dashed")
plot <- plot + stat_smooth(method="lm", se=TRUE)
plot <- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot <- plot + scale_y_log10("Difference from expected (log)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot
# Long-format frame with observed (fracIn) and expected (fracExp) fractions
# stacked, so both series can be plotted together colored by `var`.
dfA<-data.frame(lang=rep("", N),var=rep("", N),val=rep(0, N))
dfA$lang<-dfOut$lang
dfA$var<-"fracIn"
dfA$val<-dfOut$fracIn
dfB<-data.frame(lang=rep("", N),var=rep("", N),val=rep(0, N))
dfB$lang<-dfOut$lang
dfB$var<-"fracExp"
dfB$val<-dfOut$expFrac
dfTrans<-rbind(dfA,dfB)
rm(dfA)
rm(dfB)
# nodeCount recycles across the two stacked halves (same language order).
dfTrans$nodeCount<-dfOut$nodeCount
plot <- ggplot(dfTrans,aes(x=nodeCount,y=val,group=var,color=var)) + geom_point()
plot <- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
#plot <- plot + scale_y_log10("Difference from expected (log)",
# breaks = trans_breaks("log10", function(x) 10^x),
# labels = trans_format("log10", math_format(10^.x)))
plot
#E(gLangs)$sourceLang<-""
#E(gLangs)$targetLang<-""
#edges<-get.edges(gLangs, E(gLangs))
#edgeLangs<-data.frame(source=rep("",length(edges)), target=rep("",length(edges)),stringsAsFactors=FALSE)
#i<-0
#for (edge in edges) {
# s<-edges[edge,1]
# t<-edges[edge,2]
# sLang<-V(gLangs)[s]$majLangAdj
# tLang<-V(gLangs)[t]$majLangAdj
# i<-i+1
# edgeLangs[i, ] <- c(sLang,tLang)
#}
#gCollapsed<-contract.vertices(gLangs, which(langList==V(gLangs)$majLangAdj))
# Contract the user-level graph to a language-level graph: one vertex per
# language, merging all users with the same majority language.
levels<-as.factor(V(gLangs)$majLangAdj)
edges<-get.edges(gLangs, E(gLangs))
# Stash original endpoint ids on the edges so that, after simplify(),
# `source`/`target` become counts of *distinct* users on each side.
E(gLangs)$source<-edges[,1]
E(gLangs)$target<-edges[,2]
gCollapsed<-contract.vertices(gLangs, levels,vertex.attr.comb="first")
try({
gCollapsed<-remove.edge.attribute(gCollapsed, "sourceLang")
gCollapsed<-remove.edge.attribute(gCollapsed, "targetLang")
})
# First drop within-language self-loops, then merge parallel edges:
# weight = number of user-level mentions, source/target = distinct users.
gCollapsed<-simplify(gCollapsed,remove.loops=TRUE,remove.multiple=FALSE)
E(gCollapsed)$weight<-1
gCollapsed<-simplify(gCollapsed,remove.multiple=TRUE,edge.attr.comb=list(weight="sum",source=function(x) sum(!duplicated(x)),target=function(x) sum(!duplicated(x))))
summary(gCollapsed)
V(gCollapsed)$label<-V(gCollapsed)$majLangAdj
V(gCollapsed)$name<-V(gCollapsed)$majLangAdj
# Attach the user count of each language to its vertex.
V(gCollapsed)$langSize<-0
for (index in V(gCollapsed)) {
l<-V(gCollapsed)[index]$majLangAdj
V(gCollapsed)[index]$langSize<-langSizes[which(langList==l)]
}
V(gCollapsed)$langSizeLog<-log(V(gCollapsed)$langSize)
#New insertion 2013-12-19###################
#(graph without normalization / expected a la Wikipedia article (ultimately not used in published paper, but retained for reference)
E(gCollapsed)$mentionCount<-E(gCollapsed)$weight
# NOTE(review): this logs the `source` attribute (distinct source-user
# count per language pair, set in simplify() above), not mentionCount.
# Presumably intentional, since weightPercent below also uses `source` —
# confirm before reuse.
E(gCollapsed)$weightLog<-log(E(gCollapsed)$source)
weightLogSD<-sd(E(gCollapsed)$weightLog)
weightLogMean<-mean(E(gCollapsed)$weightLog)
print(paste0("Mean: ",weightLogMean))
print(paste0("95% Mark: ",weightLogMean+(1.96*weightLogSD)))
#Add some columns for use in visualization
#Random bug, igraph doesn't write boolean values to graph output files; so, use numbers
E(gCollapsed)$weightLog_geMean<-ifelse(E(gCollapsed)$weightLog>=weightLogMean,1,0)
E(gCollapsed)$weightLog_ge1SD<-ifelse(E(gCollapsed)$weightLog>=weightLogMean+weightLogSD,1,0)
E(gCollapsed)$weightLog_ge95<-ifelse(E(gCollapsed)$weightLog>=weightLogMean+(1.96*weightLogSD),1,0)
E(gCollapsed)$weightLog_diffMean<-E(gCollapsed)$weightLog-weightLogMean
#Repeat with percent of users in source lang mention users in target lang?
# Per-edge share of the source language's users that mention the target.
edges<-get.edges(gCollapsed, E(gCollapsed))
E(gCollapsed)$weightPercent<-0
for (edge in E(gCollapsed)) {
s=V(gCollapsed)[edges[[edge,1]]]$name
E(gCollapsed)[edge]$weightPercent<-E(gCollapsed)[edge]$source/langSizes[which(langList==s)]
}
# Export with weightLog as the canonical weight for visualization.
E(gCollapsed)$weight<-E(gCollapsed)$weightLog
write.graph(gCollapsed,"gCollasped_nonnormalized_usercounts.graphml",format="graphml")
#List top edges by weightlog (but give actual weight)
tmp<-get.edgelist(gCollapsed)
tmp<-cbind(as.data.frame(tmp),E(gCollapsed)$weightLog,E(gCollapsed)$source,E(gCollapsed)$weightPercent)
names(tmp)<-c("source","target","weightLog","weight","weightPercent")
#head(tmp[order(tmp$weight,decreasing=TRUE), ],n=25)
head(tmp[order(tmp$weight,decreasing=TRUE), c("source","target","weight","weightPercent")],n=7)
#Filter to ge95 and drop isolates
gTmp<-gCollapsed
gTmp<-delete.edges(gTmp,E(gTmp)[E(gTmp)$weightLog_ge95!=1])
V(gTmp)$degree<-degree(gTmp)
gTmp<-delete.vertices(gTmp,V(gTmp)$degree==0)
#Repeat everything above with English removed?
################End insertion 2013-12-19##################################
#Add missing edges with weight of 0
# Make the language graph complete: add every absent ordered language pair
# as an edge with weight 0, so expected-value comparisons below cover all
# pairs.
edgelist<-as.data.frame(get.edgelist(gCollapsed,names=TRUE))
names(edgelist)=c("source","target")
edgelist$exists<-TRUE
N<-length(langList)*(length(langList)-1)
# All ordered pairs of distinct languages. expand.grid() replaces the
# original nested for-loop that grew dfEdgesFull one row at a time (O(n^2)
# copies); merge() sorts by the key columns, so the result is identical.
dfEdgesFull<-expand.grid(source=langList,target=langList,stringsAsFactors=FALSE)
dfEdgesFull<-subset(dfEdgesFull,source!=target)
dfEdgesFull<-merge(x=dfEdgesFull,y=edgelist,all=TRUE)
missing<-dfEdgesFull[is.na(dfEdgesFull$exists), ]
# add.edges() expects a flat vector of alternating source/target names;
# interleave the two columns via rbind instead of growing a vector with
# c() inside a loop.
medges<-as.vector(rbind(as.character(missing$source),as.character(missing$target)))
gCollapsed<-add.edges(gCollapsed,medges,attr=list(weight=0))
summary(gCollapsed)
#V(gCollapsed)$outDegree<-degree(gCollapsed,V(gCollapsed),mode="out") #Not out-degree, sum of weight on out edges
# For every language-pair edge s->t, compare its observed weight to the
# weight expected if s's out-mentions were distributed over target
# languages in proportion to language size.
V(gCollapsed)$outWeight<-0
E(gCollapsed)$expected<-NA
edges<-get.edges(gCollapsed, E(gCollapsed))
# Accumulate total outgoing edge weight per source vertex.
# Iteration fixed to seq_len(nrow(edges)): the original
# seq(1,length(edges)/2) only equals the edge count because the matrix
# happens to have exactly two columns.
for (edge in seq_len(nrow(edges))) {
s<-edges[edge,1]
V(gCollapsed)[s]$outWeight<-V(gCollapsed)[s]$outWeight + E(gCollapsed)[edge]$weight
}
for (edge in seq_len(nrow(edges))) {
s<-edges[edge,1]
t<-edges[edge,2]
sLang<-V(gCollapsed)[s]$majLangAdj
tLang<-V(gCollapsed)[t]$majLangAdj
#TODO: Demoninator needs to be reduced by the size of the source language!!!
#sLangSize<-langSizes[which(langList==sLang)]
#E(gCollapsed)[edge]$expected<-(langSizes[which(langList==tLang)]/(totalNodes-sLangSize)) * V(gCollapsed)[s]$outWeight
E(gCollapsed)[edge]$expected<-(langSizes[which(langList==tLang)]/totalNodes) * V(gCollapsed)[s]$outWeight
}
summary(E(gCollapsed)$weight)
summary(E(gCollapsed)$expected)
# Relative error vs. expectation plus z-scores for flagging outlier pairs.
E(gCollapsed)$pError<-(E(gCollapsed)$weight-E(gCollapsed)$expected)/E(gCollapsed)$expected
E(gCollapsed)$pErrorZ<-(E(gCollapsed)$pError-mean(E(gCollapsed)$pError,na.rm=TRUE))/sd(E(gCollapsed)$pError,na.rm=TRUE)
E(gCollapsed)$diffPercent<-abs(E(gCollapsed)$weight-E(gCollapsed)$expected)/((E(gCollapsed)$weight+E(gCollapsed)$expected)/2)
E(gCollapsed)$zscore<-(E(gCollapsed)$weight-mean(E(gCollapsed)$weight))/sd(E(gCollapsed)$weight)
summary(E(gCollapsed)$pError)
summary(E(gCollapsed)$zscore)
write.graph(gCollapsed,"gCollapsed.graphml",format="graphml")
# Restrict to large languages and recompute the z-scores on that subset.
gCollapsedLargeLangs<-induced.subgraph(gCollapsed,V(gCollapsed)$majLangAdj %in% largeLangs)
E(gCollapsedLargeLangs)$pErrorZ<-(E(gCollapsedLargeLangs)$pError-mean(E(gCollapsedLargeLangs)$pError,na.rm=TRUE))/sd(E(gCollapsedLargeLangs)$pError,na.rm=TRUE)
E(gCollapsedLargeLangs)$zscore<-(E(gCollapsedLargeLangs)$weight-mean(E(gCollapsedLargeLangs)$weight))/sd(E(gCollapsedLargeLangs)$weight)
write.graph(gCollapsedLargeLangs,"gCollapsedLargeLangs.graphml",format="graphml")
edgeDataset<-function(graph) {
  # Flatten a collapsed language graph's edges into a data frame with one
  # row per edge: source/target language codes plus the pError, zscore and
  # pErrorZ edge attributes.
  #
  # Args:
  #   graph: an igraph object whose vertices carry majLangAdj and whose
  #          edges carry pError, zscore and pErrorZ attributes.
  # Returns:
  #   data.frame(source, target, pError, zscore, pErrorZ), characters not
  #   factors.
  #
  # Vectorized replacement for the original per-edge loop that assigned
  # one data-frame row at a time: endpoint ids from get.edges() index the
  # vertex-attribute vector directly (1-based in igraph >= 0.6), producing
  # the same columns in the same edge order.
  ep<-get.edges(graph, E(graph))
  data.frame(source=V(graph)$majLangAdj[ep[,1]],
             target=V(graph)$majLangAdj[ep[,2]],
             pError=E(graph)$pError,
             zscore=E(graph)$zscore,
             pErrorZ=E(graph)$pErrorZ,
             stringsAsFactors=FALSE)
}
#Table \label{tbl:langlang-more}
#head(dfEdges[order(dfEdges$pError,decreasing=TRUE),c("source","target","pError")],n=10)
# Top over-represented language pairs among the large languages.
dfEdgesLarge<-edgeDataset(gCollapsedLargeLangs)
dfEdgesLarge$pError100<-round(dfEdgesLarge$pError*100)
head(dfEdgesLarge[order(dfEdgesLarge$pError,decreasing=TRUE),c("source","target","pError100")],n=10)
#subset(dfEdges,pErrorZ>=1.96|pErrorZ<=1.96)
dfEdges<-edgeDataset(gCollapsed)#Is this really the same? Shouldn't it be different
dfEdgesSub<-subset(dfEdges,source%in%largeLangs & target%in%largeLangs)
head(dfEdgesSub[order(dfEdgesSub$pError,decreasing=TRUE),c("source","target","pError")],n=10)
rm(dfEdgesSub)
# Edges whose raw weight is a significant outlier (|z| > 1.96).
dfEdgesSub<-subset(dfEdges,zscore > 1.96| zscore < -1.96)
dfEdgesSub[order(dfEdgesSub$zscore,decreasing=TRUE),c("source","target","zscore")]
rm(dfEdgesSub)
# Recompute pErrorZ over all edges, then over large-language edges only.
dfEdges$pErrorZ<-(dfEdges$pError-mean(dfEdges$pError,na.rm=TRUE))/sd(dfEdges$pError,na.rm=TRUE)
dfEdgesSub<-subset(dfEdges,pErrorZ > 1.96 | pErrorZ < -1.96)
head(dfEdgesSub[order(dfEdgesSub$pErrorZ,decreasing=TRUE),c("source","target","pError","pErrorZ")],n=10)
rm(dfEdgesSub)
dfEdgesSub<-subset(dfEdges,source%in%largeLangs & target%in%largeLangs)
dfEdgesSub$pErrorZ<-(dfEdgesSub$pError-mean(dfEdgesSub$pError,na.rm=TRUE))/sd(dfEdgesSub$pError,na.rm=TRUE)
dfEdgesSub<-subset(dfEdgesSub,pErrorZ > 1.96 | pErrorZ < -1.96)
head(dfEdgesSub[order(dfEdgesSub$pErrorZ,decreasing=TRUE),c("source","target","pError","pErrorZ")],n=10)
rm(dfEdgesSub)
#N<-length(largeLangs)*(length(largeLangs)-1)
#dfEdgesFull<-data.frame(source=rep("",N),target=rep("",N),stringsAsFactors=FALSE)
#i<-0
#for (source in largeLangs) {
# for (target in largeLangs) {
# if (target!=source) {
# i<-i+1
# dfEdgesFull[i, ]<-c(source,target)
# }
# }
#}
#
#
#dfEdgesFull<-merge(x=dfEdgesFull,y=dfEdgesLarge,all=TRUE)
#dfEdgesFull[is.na(dfEdgesFull$pError), ]
#
##Any pairs with no edges in either direction? (\label{tbl:langlang-less})
#tmp<-dfEdgesFull[is.na(dfEdgesFull$pError), c("source","target")]
#dis<-data.frame(source=tmp$source,target=tmp$target)
#dis2<-data.frame(source=tmp$target,target=tmp$source)
#reallyDis<-merge(x=dis,y=dis2,all=FALSE)
#reallyDis
#New version of R lists includes edges with weight 0, this results in pError of -1
# Since zero-weight edges were added above, a disconnected pair shows up as
# pError == -1 rather than as a missing row.
sum(E(gCollapsedLargeLangs)$weight==0)==sum(dfEdgesLarge$pError==-1)
sum(dfEdgesLarge$pError==-1)
#24 disconnected pairs
# Flag pairs that are disconnected in BOTH directions (mutual).
dfTmp<-dfEdgesLarge[dfEdgesLarge$pError==-1,c("source","target")]
dfTmp$mutual<-0
for (i in seq(1,length(dfTmp$mutual))) {
s<-dfTmp[i,"source"]
t<-dfTmp[i,"target"]
if (sum(dfTmp[dfTmp$source==t, ]$target==s)==1) {
dfTmp[i,"mutual"]<-1
}
}
missingTable<-function(df) {
  # Build one LaTeX-style table row for a single source language:
  #   "<source> & t1,\emphas{t2},t3,"
  # Targets whose disconnection is mutual (df$mutual == 1) are wrapped in
  # \emphas{}.  Intended to be applied per-source via ddply (see call
  # below this function).
  #
  # Args:
  #   df: data.frame with columns source, target, mutual; all rows share
  #       the same source value.
  # Returns:
  #   a single character string (also printed as a side effect).
  #
  # Fixes two bugs in the original:
  #  * the loop header was `for (i in length(df$target))`, which iterated
  #    only once (at i = length), silently dropping all but the last target;
  #  * `paste0(df$source, " & ")` used the whole source column, producing a
  #    character vector instead of one row string.
  str<-paste0(df$source[1]," & ")
  for (i in seq_along(df$target)) {
    if (df$mutual[i]) {
      str<-paste0(str,"\\emphas{",df$target[i],"},")
    } else {
      str<-paste0(str,df$target[i],",")
    }
  }
  print(str)
  return(str)
}
ddply(dfTmp,.(source),missingTable) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment