Skip to content

Instantly share code, notes, and snippets.

@computermacgyver
Last active January 3, 2016 05:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save computermacgyver/8416496 to your computer and use it in GitHub Desktop.
Save computermacgyver/8416496 to your computer and use it in GitHub Desktop.
Analysis of Twitter mentions/retweets network for #CHI2014 paper "Global Connectivity and Multilinguals in the Twitter Network". #R #igraph
#!/usr/bin/R
# Main analysis for "Global Connectivity and Multilinguals in the Twitter Network"
# paper. See http://www.scotthale.net/pubs/?chi2014 for details.
#
# Author: Scott A. Hale (http://www.scotthale.net/)
# License: GPLv2
# If you use this in support of an academic publication, please cite:
#
# Hale, S. A. (2014) Global Connectivity and Multilinguals in the Twitter Network.
# In Proceedings of the 2014 ACM Annual Conference on Human Factors in Computing Systems,
# ACM (Montreal, Canada).
#
# More details, related code, and the original academic paper using this code
# is available at http://www.scotthale.net/pubs/?chi2014
#
# Requires density_functions.R available at https://gist.github.com/computermacgyver/8416453
library(igraph)
library(ggplot2)
library(scales)
library(gridExtra)
library(plyr)
library(parallel)
library(foreach)
library(doParallel)
source("density_functions.R") #This file available at https://gist.github.com/computermacgyver/8416453
#if non-interactive and --save flag use the following to save the enviornment before quitting on an error
#options(error = quote(q("yes")))
NUM.CORES<-5
NUM.RUNS<-100
if (NUM.RUNS<NUM.CORES) {
NUM.CORES<-NUM.RUNS#Only use one core if we are running only once
}
#TODO: Consider average time between tweets -- remove lowest quartile as bots (?)
#TODO: Combine Indonesian and Malay together!
g<-read.graph("../../hadoop-twitter/graph_ge4_ge2-20_clean-msin_anon.graphml",format="graphml")
components<-decompose.graph(g,mode="weak", min.vertices = 100)
gLCC<-components[[1]]
rm(g)
pngWidth<-800
pngHeight<-700
svgWidth<-8
svgHeight<-8
summary(V(gLCC)$tweetCount)
# summary(V(gLCC)$tweetCount)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1.000 1.000 2.000 3.615 4.000 343.000
#Filter to require at least one *in* edge
V(gLCC)$inDegree<-degree(gLCC, mode = "in")
summary(V(gLCC)$inDegree)
gMin5<-induced.subgraph(gLCC,which(V(gLCC)$inDegree>0 & V(gLCC)$tweetCountAdj>0 & V(gLCC)$tweetCount>=4))
components<-decompose.graph(gMin5,mode="weak", min.vertices = 100)
gMin5LCC<-components[[1]]
#rm(gLCC)
rm(gMin5)
#clusters<-edge.betweenness.community(gMin5LCC)
#sizes(clusters)
summary(gMin5LCC)
gMin5LCC<-simplify(gMin5LCC,remove.loops=TRUE,remove.multiple=TRUE,edge.attr.comb="sum")
summary(gMin5LCC)
gMin5LCCUndirected<-as.undirected(gMin5LCC, mode = "collapse")
summary(gMin5LCCUndirected)
summary(V(gMin5LCCUndirected)$majLangPercentAdj)
length(V(gMin5LCCUndirected)$majLangPercentAdj)
length(V(gMin5LCCUndirected)$majLangPercentAdj[V(gMin5LCCUndirected)$majLangPercentAdj==1])
length(V(gMin5LCCUndirected)$majLangPercentAdj[V(gMin5LCCUndirected)$majLangPercentAdj!=1])
length(V(gMin5LCCUndirected)$majLangPercentAdj[V(gMin5LCCUndirected)$majLangPercentAdj!=1])/length(V(gMin5LCCUndirected)$majLangPercentAdj)
##############################
## Summary ##
###############################
write.graph(gMin5LCC,"edges.list",format="edgelist")
write(V(gMin5LCC)$majLangAdj,"langclean.txt")
write.csv(V(gMin5LCC)$label,"usernames.txt",row.names=FALSE)#Untested
V(gMin5LCC)$inDegree<-degree(gMin5LCC, mode = "in")
V(gMin5LCC)$outDegree<-degree(gMin5LCC, mode = "out")
V(gMin5LCC)$constraint<-constraint(gMin5LCC)
#V(gMin5LCC)$betweenness<-betweenness(gMin5LCC)
V(gMin5LCC)$multilingual[V(gMin5LCC)$majLangPercentAdj!=1]<-"Multilingual"
V(gMin5LCC)$multilingual[V(gMin5LCC)$majLangPercentAdj==1]<-"Monolingual"
#V(gMin5LCCUndirected)$multilingual<-as.factor(V(gMin5LCCUndirected)$multilingual)
table(V(gMin5LCC)$multilingual)
length(V(gMin5LCC)$multilingual)
sumStats<-data.frame(multilingual=V(gMin5LCC)$multilingual,
tweetCount=V(gMin5LCC)$tweetCountAdj,
inDegree=V(gMin5LCC)$inDegree,
outDegree=V(gMin5LCC)$outDegree,
majLangCount=V(gMin5LCC)$majLangCountAdj,
constraint=V(gMin5LCC)$constraint#,betweenness=V(gMin5LCC)$betweenness
)
#TODO: Remove outliers?
svg("tweetCount.svg",width=4,height=4)
plot <- ggplot(sumStats,aes(x=multilingual,y=tweetCount))
plot <- plot + geom_boxplot() + scale_y_log10("Number of tweets")
plot <- plot + scale_x_discrete("") + theme_bw()
plot
dev.off()
svg("inDegree.svg",width=4,height=4)
plot <- ggplot(sumStats,aes(x=multilingual,y=inDegree))
plot <- plot + geom_boxplot() + scale_y_log10("In-degree")
plot <- plot + scale_x_discrete("") + theme_bw()
plot
dev.off()
svg("outDegree.svg",width=4,height=4)
plot <- ggplot(sumStats,aes(x=multilingual,y=outDegree))
plot <- plot + geom_boxplot() + scale_y_log10("Out-degree")
plot <- plot + scale_x_discrete("") + theme_bw()
plot
dev.off()
svg("majLangCount.svg",width=4,height=4)
plot <- ggplot(sumStats,aes(x=multilingual,y=majLangCount))
plot <- plot + geom_boxplot() + scale_y_log10("Number of tweets in majority language")
plot <- plot + scale_x_discrete("") + theme_bw()
plot
dev.off()
ggplot(sumStats,aes(x=tweetCount,color=multilingual)) + geom_density() + scale_x_log10()
#TODO: Different linetypes for split variable??? (e.g., dashed for mono, solid for multi)
pTweetCount<-density.log(sumStats,"tweetCount","multilingual",n=2048,adjust=8,title="Number of tweets")
pIndegree<-density.log(sumStats,"inDegree","multilingual",n=2048,adjust=8,title="In-degree/visibility")
pOutdegree<-density.log(sumStats,"outDegree","multilingual",n=2048,adjust=8,title="Out-degree/reach")
pConstraint<-density.log(sumStats,"constraint","multilingual",n=2048,adjust=8,title="Burt's constraint")
#pBetweenness<-density.log(sumStats,"betweenness","multilingual",n=2048,adjust=8,title="Betweenness")
tmp <- ggplot_gtable(ggplot_build(pTweetCount$plot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
leg <- tmp$grobs[[leg]]
lheight <- sum(leg$height)
svg("introSummary2.svg",width=12,height=5)
grid.arrange(
pTweetCount$plot+theme(legend.position="none")+scale_x_log10("Number of tweets",
breaks = c(1,10,100),
labels = trans_format("log10", math_format(10^.x))),
pOutdegree$plot+theme(legend.position="none")+scale_x_log10("Out-degree/reach",
breaks = c(1,10,100),
labels = trans_format("log10", math_format(10^.x))),
pIndegree$plot+theme(legend.position="none")+scale_x_log10("In-degree/visibility",
breaks = c(1,10,100,1000,10000),
labels = trans_format("log10", math_format(10^.x))),
#Row 2
textGrob(""), leg, ncol=3,ncol=1,heights=unit.c(unit(1, "npc") - lheight, lheight)
)
dev.off()
svg("introSummary.svg",width=12,height=5)
grid.arrange(
pTweetCount$plot+theme(legend.position="none"),
pOutdegree$plot+theme(legend.position="none"),
pIndegree$plot+theme(legend.position="none"),
#Row 2
textGrob(""), leg, ncol=3,ncol=1,heights=unit.c(unit(1, "npc") - lheight, lheight)
)
dev.off()
#plot <- ggplot(subset(sumStats,tweetCount>=4),aes(x=tweetCount,y=inDegree,group=multilingual,color=multilingual))
#plot <- plot + geom_point() + scale_y_log10("Number of tweets in majority language")
#plot <- plot + scale_x_log10("") + theme_bw()
#plot <- ggplot(subset(sumStats,multilingual=="Multilingual"),aes(x=majLangPercent)) + geom_hist()
summary(sumStats$tweetCount[sumStats$multilingual=="Monolingual"])
sd(sumStats$tweetCount[sumStats$multilingual=="Monolingual"])
summary(sumStats$tweetCount[sumStats$multilingual=="Multilingual"])
sd(sumStats$tweetCount[sumStats$multilingual=="Multilingual"])
t.test(sumStats$tweetCount[sumStats$multilingual=="Monolingual"],sumStats$tweetCount[sumStats$multilingual=="Multilingual"])
wilcox.test(sumStats$tweetCount~sumStats$multilingual)
kruskal.test(sumStats$tweetCount~sumStats$multilingual)
summary(sumStats$outDegree[sumStats$multilingual=="Monolingual"])
sd(sumStats$outDegree[sumStats$multilingual=="Monolingual"])
summary(sumStats$outDegree[sumStats$multilingual=="Multilingual"])
sd(sumStats$outDegree[sumStats$multilingual=="Multilingual"])
t.test(sumStats$outDegree[sumStats$multilingual=="Monolingual"],sumStats$outDegree[sumStats$multilingual=="Multilingual"])
wilcox.test(sumStats$outDegree~sumStats$multilingual)
kruskal.test(sumStats$outDegree~sumStats$multilingual)
summary(sumStats$inDegree[sumStats$multilingual=="Monolingual"])
sd(sumStats$inDegree[sumStats$multilingual=="Monolingual"])
summary(sumStats$inDegree[sumStats$multilingual=="Multilingual"])
sd(sumStats$inDegree[sumStats$multilingual=="Multilingual"])
t.test(sumStats$inDegree[sumStats$multilingual=="Monolingual"],sumStats$inDegree[sumStats$multilingual=="Multilingual"])
wilcox.test(sumStats$inDegree~sumStats$multilingual)
kruskal.test(sumStats$inDegree~sumStats$multilingual)
summary(sumStats$constraint[sumStats$multilingual=="Monolingual"])
summary(sumStats$constraint[sumStats$multilingual=="Multilingual"])
#summary(sumStats$betweenness[sumStats$multilingual=="Monolingual"])
#summary(sumStats$betweenness[sumStats$multilingual=="Multilingual"])
########################
## Node removal -- This analysis is not included in the published paper. See Node removal simple.
########################
#see games.c
#gRand<-rewire(gMin5LCCUndirected,mode="simple",niter=1000)
#OR
#Copy majLangPercentRank across based on degreeRank
#V(gMin5LCCUndirected)$degreeRank<-rank(V(gMin5LCCUndirected)$degree,ties.method="random")
#V(gRand)$degree<-degree(gRand)
#V(gRand)$degreeRank<-rank(V(gRand)$degree,ties.method="random")
#ranks<-V(gMin5LCCUndirected)$majLangPercentRank[order(V(gMin5LCCUndirected)$degreeRank)]
#V(gRand)$majLangPercentRank<-ranks[order(V(gRand)$degreeRank)]
V(gMin5LCCUndirected)$multilingual[V(gMin5LCCUndirected)$majLangPercentAdj!=1]<-"Multilingual"
V(gMin5LCCUndirected)$multilingual[V(gMin5LCCUndirected)$majLangPercentAdj==1]<-"Monolingual"
#TODO: Consider comparing to attack algorithm -- e.g. remove most central nodes first (by k-cores?)
# Betweenness centrality does not work as it changes as nodes are removed
# Could consdier comparing to removing highest degree first -- Did this (I think): multilinguals created less components than highest degree first
runNodeRemoval <- function(vals) {
# as many rows & cols as needed; don't know levels yet
GRAPHSIZE<-length(V(gMin5LCCUndirected))
#vals<-seq(0.01,0.99,0.03)#c(.01,.02,.03,.04,.05,.06,.07,.08,.09,.1,.2,.3,.4,.5,.6,.7,.8,.9)
#vals<-seq(0.01,0.1,0.005)
N <- length(vals)*3
df <- data.frame(val=rep(NA, N), components=rep(NA, N), lccSize=rep(NA, N), avgSize=rep(NA, N), constraint=rep(NA, N), numEdges=rep(NA,N), numNodes=rep(NA,N), method=rep("", N), stringsAsFactors=FALSE)
i<-0
#Multilinguals first
for (x in vals) {
print(x)
y<-x*GRAPHSIZE
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentRank>=y)
#print(summary(tmp))
#clusters<-fastgreedy.community(tmp)
components<-clusters(tmp,mode="weak")
#print(components$no)
m<-max(components$csize)
i<-i+1
df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "multiFirst")
}
#Monolinguals first
for (x in vals) {
print(x)
y<-(1-x)*GRAPHSIZE
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentRank<=y)
#print(summary(tmp))
#clusters<-fastgreedy.community(tmp)
components<-clusters(tmp,mode="weak")
#print(components$no)
m<-max(components$csize)
i<-i+1
df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "monoFirst")
}
#Randomly permute majLangPercent and repeat the above
V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRank)
for (x in vals) {
print(x)
y<-x*GRAPHSIZE
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$permuted>=y)
#print(summary(tmp))
components<-clusters(tmp,mode="weak")
#print(components$no)
m<-max(components$csize)
i<-i+1
df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "random")
}
#Compare to gRand
for (x in vals) {
print(x)
y<-x*GRAPHSIZE
tmp<-induced.subgraph(gRand,V(gRand)$majLangPercentRank>=y)
#print(summary(tmp))
#clusters<-fastgreedy.community(tmp)
components<-clusters(tmp,mode="weak")
#print(components$no)
m<-max(components$csize)
i<-i+1
df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "random-rewire")
}
#High degree first
#V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRank)
#for (x in vals) {
# print(x)
# y<-x*GRAPHSIZE
# tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$degreeRank>=y)
# print(summary(tmp))
# components<-clusters(tmp,mode="weak")
# print(components$no)
# m<-max(components$csize)
# i<-i+1
# df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), "highFirst")
#}
return(df)
}
if (NUM.CORES>1) {
cl <- makeCluster(NUM.CORES)
registerDoParallel(cl, cores = NUM.CORES)
}
dfAvg <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"),
.combine = rbind) %dopar% {
print(paste0("Run: ",run))
GRAPHSIZE<-length(V(gMin5LCCUndirected))
V(gMin5LCCUndirected)$degree<-degree(gMin5LCCUndirected, mode = "all")
V(gMin5LCCUndirected)$majLangPercentRank<-rank(V(gMin5LCCUndirected)$majLangPercentAdj,ties.method="random")
gRand<-degree.sequence.game(degree(gMin5LCCUndirected),method="vl")
V(gRand)$majLangPercentRank<-V(gMin5LCCUndirected)$majLangPercentRank
df<-runNodeRemoval(seq(0.01,0.99,0.01))
df$edgesRemoved<-ecount(gMin5LCCUndirected)-as.numeric(df$numEdges)
write.csv(df,paste0("output_df_randomOrder_withCounts_",run,".csv"))
return(df)
}
warnings()
if (NUM.CORES>1) {
stopCluster(cl)
}
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$multilingual=="Monolingual")
1-ecount(tmp)/ecount(gMin5LCCUndirected)
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentAdj>=.6)
1-ecount(tmp)/ecount(gMin5LCCUndirected)
sum(V(gMin5LCCUndirected)$multilingual=="Multilingual")/vcount(gMin5LCCUndirected)
sum(V(gMin5LCCUndirected)$majLangPercentAdj<.6)/vcount(gMin5LCCUndirected)
#dfZoom<-runNodeRemoval(seq(0.01,0.2,0.01))
#write.csv(dfZoom,"output_dfZoom.csv")
#df<-read.csv("output2.csv")
df<-dfAvg
df$val<-as.numeric(df$val)
df$components<-as.numeric(df$components)
df$lccSize<-as.numeric(df$lccSize)
df$avgSize<-as.numeric(df$avgSize)
df$constraint<-as.numeric(df$constraint)
df$numEdges<-as.numeric(df$numEdges)
df$numNodes<-as.numeric(df$numNodes)
df$edgesRemoved<-as.numeric(df$edgesRemoved)
df<-ddply(df,.(val,method),summarize,
components=mean(components),lccSize=mean(lccSize),
avgSize=mean(avgSize),constraint=mean(constraint),
numEdges=mean(numEdges),numNodes=mean(numNodes),edgesRemoved=mean(edgesRemoved))
write.csv(df,"output_df_randomOrder_withCounts_avg100.csv")
png("no_components.png",width=pngWidth,height=pngHeight)
plot<-ggplot(df,aes(x=val,y=components,color=method,group=method))
plot<-plot + geom_line() + scale_color_brewer(type="qual") + scale_x_continuous(labels=percent)
plot
dev.off()
png("lccSize.png",width=pngWidth,height=pngHeight)
plot<-ggplot(df,aes(x=val,y=lccSize,color=method,group=method))
plot<-plot + geom_line() + scale_color_brewer(type="qual") + scale_x_continuous(labels=percent)
plot
dev.off()
png("avgSize.png",width=pngWidth,height=pngHeight)
plot<-ggplot(df,aes(x=val,y=avgSize,color=method,group=method))
plot<-plot + geom_line() + scale_color_brewer(type="qual") + scale_x_continuous(labels=percent)
plot
dev.off()
png("constraint.png",width=pngWidth,height=pngHeight)
plot<-ggplot(df,aes(x=val,y=constraint,color=method,group=method))
plot<-plot + geom_line()
plot
dev.off()
########################
## Node removal simple -- This is included
########################
runNodeRemovalSimple <- function(gMin5LCCUndirected,gRand) {
nodesToRemove<-length(V(gMin5LCCUndirected)[V(gMin5LCCUndirected)$majLangPercentAdj<1])
GRAPHSIZE<-length(V(gMin5LCCUndirected))
N <- 4
df <- data.frame(components=rep(NA, N), lccSize=rep(NA, N), avgSize=rep(NA, N), constraint=rep(NA, N), numEdges=rep(NA,N), numNodes=rep(NA,N), method=rep("", N), stringsAsFactors=FALSE)
i<-0
print("Multilinguals (all removed)")
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentAdj==1)
components<-clusters(tmp,mode="weak")
m<-max(components$csize)
i<-i+1
df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "multi")
print("Monolinguals (random subset)")
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentRank<=(GRAPHSIZE-nodesToRemove))
components<-clusters(tmp,mode="weak")
m<-max(components$csize)
i<-i+1
df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "mono")
#Randomly permute majLangPercent and repeat the above
V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRank)
print("Random subset")
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$permuted>nodesToRemove)
components<-clusters(tmp,mode="weak")
m<-max(components$csize)
i<-i+1
df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "random")
print("Multilinguals (all, edge rewiring)")
tmp<-induced.subgraph(gRand,V(gRand)$majLangPercentAdj==1)
components<-clusters(tmp,mode="weak")
m<-max(components$csize)
i<-i+1
df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "multiRewire")
print("Highest degree subset")
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$degreeRank>nodesToRemove)
components<-clusters(tmp,mode="weak")
m<-max(components$csize)
i<-i+1
df[i, ] <- list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "highDegree")
return(df)
}
if (NUM.CORES>1) {
cl <- makeCluster(NUM.CORES)
registerDoParallel(cl, cores = NUM.CORES)
}
dfNodeRemoval <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"),
.combine = rbind) %dopar% {
print(paste0("Run: ",run))
GRAPHSIZE<-length(V(gMin5LCCUndirected))
V(gMin5LCCUndirected)$degree<-degree(gMin5LCCUndirected, mode = "all")
V(gMin5LCCUndirected)$degreeRank<-GRAPHSIZE-rank(V(gMin5LCCUndirected)$degree,ties.method="random")
V(gMin5LCCUndirected)$majLangPercentRank<-rank(V(gMin5LCCUndirected)$majLangPercentAdj,ties.method="random")
gRand<-degree.sequence.game(degree(gMin5LCCUndirected),method="vl")
V(gRand)$majLangPercentRank<-V(gMin5LCCUndirected)$majLangPercentRank
V(gRand)$majLangPercentAdj<-V(gMin5LCCUndirected)$majLangPercentAdj
df<-runNodeRemovalSimple(gMin5LCCUndirected,gRand)
df$edgesRemoved<-ecount(gMin5LCCUndirected)-as.numeric(df$numEdges)
df$run<-run
write.csv(df,paste0("output_dfNodeRemovalSimple_",run,".csv"))
return(df)
}
write.csv(dfNodeRemoval,"dfNodeRemovalSimple.csv")
warnings()
if (NUM.CORES>1) {
stopCluster(cl)
}
conf95<-function(df,var) {
m<-mean(df[,var])
s<-sd(df[,var])
tmp<-data.frame(
m,
s,
m+(1.96*s),
m-(1.96*s),
max(df[,var]),
min(df[,var])
)
eval(names(tmp)<-c(
paste0(var,"Mean"),paste0(var,"SD"),
paste0(var,"95Max"),paste0(var,"95Min"),
paste0(var,"Max"),paste0(var,"Min")
))
return(tmp)
}
meanSD<-function(df) {
tmp<-data.frame(
lccSize=mean(df$lccSize),lccSizeSD=sd(df$lccSize),
avgSize=mean(df$avgSize),avgSizeSD=sd(df$avgSize),
components=mean(df$components),componentsSD=sd(df$components)
)
lccSize<-conf95(df,"lccSize")
avgSize<-conf95(df,"avgSize")
components<-conf95(df,"components")
#print(tmp)
return(cbind(lccSize,avgSize,components))
}
#dfNodeRemoval<-read.csv("dfNodeRemovalSimple.csv")
dfNodeRemovalAvg<-ddply(dfNodeRemoval,.(method),meanSD)
dfNodeRemovalAvg
dfNodeRemovalAvg<-subset(dfNodeRemovalAvg,method!="highDegree")
#bar chart
ggplot(dfNodeRemovalAvg,aes(x=method,y=componentsMean)) + geom_bar(stat="identity",alpha=0.75) + geom_errorbar(aes(ymax=components95Max,ymin=components95Min)) + theme_bw()
ggplot(dfNodeRemovalAvg,aes(x=method,y=lccSizeMean)) + geom_bar(stat="identity",alpha=0.75) + geom_errorbar(aes(ymax=lccSize95Max,ymin=lccSize95Min)) + theme_bw()
ggplot(dfNodeRemovalAvg,aes(x=method,y=avgSizeMean)) + geom_bar(stat="identity",alpha=0.75) + geom_errorbar(aes(ymax=avgSize95Max,ymin=avgSize95Min)) + theme_bw()
#boxplots
dfNodeRemoval<-subset(dfNodeRemoval,method!="highDegree")
dfNodeRemoval$label<-factor(dfNodeRemoval$method,levels=c("multi","mono","random","multiRewire"),labels=c("Multilinguals","\nMonolinguals","Random","\nMultilinguals\n(edges rewired)"))
dfNodeRemovalAvg$label<-factor(dfNodeRemovalAvg$method,levels=c("multi","mono","random","multiRewire"),labels=c("Multilinguals","\nMonolinguals","Random","\nMultilinguals\n(edges rewired)"))
#xscale<-scale_x_discrete("",
# breaks=c("multi","mono","multiRewire","random"),
# labels=c("Multilinguals","\nMonolinguals","\nMultilinguals\n(Edges rewired)","Random")
#)
xscale<-scale_x_discrete("")
cPlot<-ggplot(dfNodeRemoval,aes(x=label,y=components))+geom_boxplot()+theme_bw()
cPlot<-cPlot+xscale+scale_y_continuous("Number of components",labels=comma)
cPlot<-cPlot+geom_point(data=dfNodeRemovalAvg,aes(x=label,y=componentsMean),shape=3)#3=+, 4=*
lccPlot<-ggplot(dfNodeRemoval,aes(x=label,y=lccSize))+geom_boxplot()+theme_bw()
lccPlot<-lccPlot+xscale+scale_y_continuous("Size of LCC",labels=comma)
lccPlot<-lccPlot+geom_point(data=dfNodeRemovalAvg,aes(x=label,y=lccSizeMean),shape=3)
avgPlot<-ggplot(dfNodeRemoval,aes(x=label,y=avgSize))+geom_boxplot()+theme_bw()
avgPlot<-avgPlot+xscale+scale_y_continuous("Average component size (not including LCC)",labels=comma)
avgPlot<-avgPlot+geom_point(data=dfNodeRemovalAvg,aes(x=label,y=avgSizeMean),shape=3)
svg("nodeRemovalSimple.svg",width=12,height=5)
grid.arrange(lccPlot,cPlot,avgPlot,nrow=1,ncol=3)
dev.off()
#Addl -- compare % left in LCC to having removed all speakers of a given language (dfRemoveLangs)
nodesToRemove<-length(V(gMin5LCCUndirected)[V(gMin5LCCUndirected)$majLangPercentAdj<1])
dfNodeRemovalAvg$lccPercent<-1-(dfNodeRemovalAvg$lccSizeMean/(length(V(gMin5LCC))-nodesToRemove))
dfNodeRemovalAvg[,c("method","lccPercent")]
dfRemoveLangs[dfRemoveLangs=="en","lccFrac"]
###############################
## Language analysis - remove nodes from one language at a time
##############################
#\label{tbl:topLangs}
#TOP Langs: Table tbl:topLangs
sort(table(V(gMin5LCCUndirected)$majLangAdj))
langs<-c("en","ja","pt","ms","es","nl","ko","th")
#Avg. tweets per lang
dfTmp<-data.frame(majLangAdj=V(gMin5LCCUndirected)$majLangAdj,tweetCount=V(gMin5LCCUndirected)$tweetCount)
dfTmp<-ddply(dfTmp,.(majLangAdj),summarize,meanTweets=round(mean(tweetCount),2),tweetSD=round(sd(tweetCount),2),tweetMin=min(tweetCount),tweetMax=max(tweetCount),tweetMed=median(tweetCount),sumTweets=sum(tweetCount),numUsers=length(tweetCount))
dfTmp[order(dfTmp$numUsers,decreasing=TRUE),c("majLangAdj","numUsers","meanTweets","tweetSD","tweetMin","tweetMax","tweetMed")]
print(dfTmp[order(dfTmp$numUsers,decreasing=TRUE),c("majLangAdj","numUsers","meanTweets","tweetSD")],row.names=FALSE)
GRAPHSIZE<-length(V(gMin5LCCUndirected))
vals<-seq(0.01,0.3,0.01)
N <- length(vals)*(length(langs)+1)
if (NUM.CORES>1) {
cl <- makeCluster(NUM.CORES)
registerDoParallel(cl, cores = NUM.CORES)
}
dfLang100 <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"),
.combine = rbind) %dopar% {
dfLang <- data.frame(val=rep(NA, N), components=rep(NA, N), lccSize=rep(NA, N), avgSize=rep(NA, N), lang=rep("", N),stringsAsFactors=FALSE)
i<-0
print(paste0("Run: ",run))
V(gMin5LCCUndirected)$majLangPercentRank<-rank(V(gMin5LCCUndirected)$majLangPercentAdj,ties.method="random")
V(gMin5LCCUndirected)$majLangPercentRankRev<-GRAPHSIZE-V(gMin5LCCUndirected)$majLangPercentRank
for (lang in langs) {
ranks<-sort(V(gMin5LCCUndirected)$majLangPercentRankRev[V(gMin5LCCUndirected)$majLangAdj==lang])
for (x in vals) {
print(paste(lang,x))
y<-x*GRAPHSIZE
y<-ranks[min(y,length(ranks))]
tmp<-induced.subgraph(gMin5LCCUndirected,
V(gMin5LCCUndirected)$majLangAdj!=lang | V(gMin5LCCUndirected)$majLangPercentRankRev>=y)
#print(summary(tmp))
components<-clusters(tmp,mode="weak")
#print(components$no)
m<-max(components$csize)
i<-i+1
dfLang[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), lang)
}
}
#Randomly permute majLangPercentRev and repeat the above
V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRankRev)
for (x in vals) {
print(paste("random",x))
y<-x*GRAPHSIZE
tmp<-induced.subgraph(gMin5LCCUndirected,
V(gMin5LCCUndirected)$permuted>=y)
#print(summary(tmp))
components<-clusters(tmp,mode="weak")
#print(components$no)
m<-max(components$csize)
i<-i+1
dfLang[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), "random")
}
write.csv(dfLang,paste0("dfLang_run_",run,".csv"))
return(dfLang)
}
dfLang<-dfLang100
dfLang$val<-as.numeric(dfLang$val)
dfLang$components<-as.numeric(dfLang$components)
dfLang$lccSize<-as.numeric(dfLang$lccSize)
dfLang$avgSize<-as.numeric(dfLang$avgSize)
write.csv(dfLang,"dfLangAllRuns.csv")
dfLang<-ddply(dfLang,.(val,lang),summarize,
components=mean(components),lccSize=mean(lccSize),
avgSize=mean(avgSize))
write.csv(dfLang,"dfLang_avg100.csv")
warnings()
if (NUM.CORES>1) {
stopCluster(cl)
}
#scale_color_brewer(type="qual")
dfLang<-subset(dfLang,lang %in% langs)
dfLang$lang[dfLang$lang=="random"]<-"Random"
svg("lang_no_components.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfLang,aes(x=val,y=components,color=lang,group=lang))
plot<-plot + geom_point() + geom_line(size=1) + scale_color_discrete("Language")#,type="qual")
plot<-plot + scale_x_continuous('% of users removed',labels=percent)
plot<-plot + scale_y_continuous("Number of components",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot<-plot + geom_text(aes(x=.31,y=components,color=lang,label=lang),data=subset(dfLang,val==0.3))
plot
dev.off()
dfEnding<-subset(dfLang,val==.3)
dfEnding[order(dfEnding$components), ]
svg("lang_lccSize.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfLang,aes(x=val,y=lccSize,color=lang,group=lang))
plot<-plot + geom_point() + geom_line(size=1) + scale_color_discrete("Language")#,type="qual")
plot<-plot + scale_x_continuous('% of users removed',labels=percent)
plot<-plot + scale_y_continuous("Size of largest connected component",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16))
plot
dev.off()
#TODO: Look at direct.label
table(dfLang$lang[dfLang$val==.3],dfLang$components[dfLang$val==.3])
#Ending num. components for each language (val==.3)
#en 48,190
#es 1,923
#in 6,864
#ja 385
#ko 184
#ms 8,626
#nl 965
#pt 2,211
#random 67,206
V(gMin5LCCUndirected)$constraint<-constraint(gMin5LCCUndirected)
dfGraph<-data.frame(
majLangPercentAdj=V(gMin5LCCUndirected)$majLangPercentAdj,
tweetCount=V(gMin5LCCUndirected)$tweetCountAdj,
inDegree=V(gMin5LCCUndirected)$inDegree,
constraint=V(gMin5LCCUndirected)$constraint
)
#More graphs
png("tweetCount_majLangPercent.png",width=pngWidth,height=pngHeight)
plot <- ggplot(dfGraph,aes(x=majLangPercentAdj,y=tweetCount)) + geom_point()
plot
dev.off()
png("inDegree_majLangPercent.png",width=pngWidth,height=pngHeight)
plot <- ggplot(dfGraph,aes(x=majLangPercentAdj,y=inDegree)) + geom_point()
plot
dev.off()
png("constraint_majLangPercent.png",width=pngWidth,height=pngHeight)
plot <- ggplot(dfGraph,aes(x=majLangPercentAdj,y=constraint)) + geom_point()
plot
dev.off()
V(gMin5LCCUndirected)$label[V(gMin5LCCUndirected)$inDegree>40000]#justinbieber
V(gMin5LCCUndirected)$label[V(gMin5LCCUndirected)$inDegree>10000]#SlGNO
#Remove nodes from lang and calculate fraction of reminaing nodes in LCC
langs<-as.data.frame(table(V(gMin5LCCUndirected)$majLangAdj))
langs<-subset(langs,langs$Freq>1000)$Var1
N<-length(langs)
dfRemoveLangs<-data.frame(lang=rep("",N),components=rep(NA,N),lccSize=rep(NA,N),
numNodes=rep(NA,N),num2=rep(NA,N),stringsAsFactors=FALSE)
i<-0
for (lang in langs) {
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangAdj!=lang)
components<-clusters(tmp,mode="weak")
m<-max(components$csize)
i<-i+1
dfRemoveLangs[i, ] <- list(lang, components$no, m, length(V(tmp)), sum(components$csize[components$csize!=m]))
}
dfRemoveLangs$numNodes==dfRemoveLangs$num2+dfRemoveLangs$lccSize
dfRemoveLangs$lccFrac<-(dfRemoveLangs$numNodes-dfRemoveLangs$lccSize)/dfRemoveLangs$numNodes
svg("lang_removed.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfRemoveLangs,aes(x=lang,y=lccFrac))
plot<-plot + geom_bar(stat="identity")
plot<-plot + scale_y_continuous('% of nodes not in largest-connected component',labels=percent)
plot<-plot + scale_x_discrete("Language removed")
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangAdj!="ms" & V(gMin5LCCUndirected)$majLangAdj!="in")
components<-clusters(tmp,mode="weak")
m<-max(components$csize)
(length(V(tmp))-m)/length(V(tmp))
###################################################
## Q1: Is language homophilous? ##
###################################################
#take largest *strongly* connected component?
#labelPropComm<-label.propagation.community(gMin5LCCUndirected)
#write.csv(labelPropComm$membership,"labelPropCommMembership.csv")
commsFile<-"iter72memberships_smart.txt" #This file comes from the label propagation code in Java
if (!is.na(commsFile)){
memberships<-read.csv(commsFile,header=FALSE,sep=" ")
#Begin Untested code
tmp<-read.csv("usernames.txt",header=FALSE)
sum(tmp!=V(gMin5LCCUndirected)$label)#This should be 0, else the lists are malaligned!
#End untested code
modularity(gMin5LCCUndirected,memberships$V2)
#[1] 0.7471879
#in-ms: 0.8054294
N <- max(memberships$V2)
dfComms <- data.frame(num=seq(1, N), size=rep(0, N), numLangs=rep(0, N), majLangCount=rep(0,N), majLang=rep("", N),stringsAsFactors=FALSE)
for (comm in seq(1,N)) {
size<-length(memberships$V2[memberships$V2==comm])
langs<-V(gMin5LCCUndirected)$majLangAdj[memberships$V2==comm]
numLangs<-length(table(langs))
majLang <- unique(langs)
majLang <- majLang[which.max(tabulate(match(langs,majLang)))]
majLangCount<-sum(langs==majLang)
dfComms[comm, ] <- list(comm, size, numLangs, majLangCount, majLang)
}
write.csv(dfComms,"dfCommunities.csv")
#dfComms<-read.csv("community_info_iter36.csv")
#dfComms$label<-dfComms$label+1
sum(dfComms$numLangs==1)
sum(dfComms$numLangs==1)/length(dfComms$numLangs) #A large number of components have one language only
sum(dfComms$size[dfComms$numLangs==1])/sum(dfComms$size) #But these components represent a small number of users
#Five large components have 61% of users, #\label{tbl:topCommunities}
sum(dfComms$size[dfComms$size>10^4])/sum(dfComms$size)
dfComms[dfComms$size>10^4, ]
top<-dfComms[dfComms$size>10^4, ]
top$majLangPercent<-round((top$majLangCount/top$size)*100,1)
top[order(top$size,decreasing=TRUE), c("majLang","majLangPercent","numLangs","size")]
head(dfComms[order(dfComms$size,decreasing=TRUE), c("majLang","majLangPercent","numLangs","size")],n=25)
sort(table(dfComms$majLang))
#Is 48 the total number of langs in the network?
length(unique(V(gMin5LCCUndirected)$majLangAdj))
dfComms$majLangPercent<-dfComms$majLangCount/dfComms$size
summary(dfComms)
with(dfComms,plot(log(size),majLangPercent))
max(dfComms$size)
dfComms$label[dfComms$size==max(dfComms$size)]
#label 16 is quite large
svg("communities_size_percent.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=size,y=majLangPercent,color=majLang))
plot<-plot+geom_point()
plot<- plot + scale_x_log10("Cluster size",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified in dominant language",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
svg("communities_size_percent_labels.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=size,y=majLangPercent,label=majLang))
plot<-plot+geom_text()
plot<- plot + scale_x_log10("Cluster size",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified in dominant language",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
svg("communities_size_percent_labels_partial.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=size,y=majLangPercent,color=majLang))
plot<-plot+geom_point()
plot<-plot+geom_text(data=subset(dfComms,size>10^4),aes(x=size,y=majLangPercent+0.025,label=majLang))
plot<- plot + scale_x_log10("Cluster size",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified in dominant language",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
dfComms[dfComms$size>10^4, ]
#hist(log(dfComms$size))
svg("community_size.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=size))
plot<-plot+geom_histogram(binwidth=0.15)
plot<- plot + scale_x_log10("Cluster size",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Frequency",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
svg("community_numLangs_hist.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=numLangs))
plot<-plot+geom_histogram(binwidth=1)
plot<- plot + scale_x_continuous("Number of languages per cluster")
#plot<-plot + scale_y_continuous("Percent of users in communities with this number of languages or fewer (cdf)",labels=percent)
plot<-plot + scale_y_continuous("Frequency",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
svg("community_numLangs_hist_zoom.svg",width=svgWidth,height=svgHeight)
zplot<-ggplot(subset(dfComms,numLangs<=5),aes(x=as.factor(numLangs)))
zplot<-zplot+geom_histogram(binwidth=1)
zplot<-zplot + scale_x_discrete("Number of languages per cluster")
zplot<-zplot + scale_y_continuous("Frequency",labels=comma)
zplot<-zplot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
zplot
dev.off()
#dfDensity<-densityDataframe(dfComms,"numLangs",NA,n=2048,adjust=8)
#dplot <- ggplot(dfDensity,aes(x=x10,y=y)) + geom_path()
dfTmp<-as.data.frame(table(dfComms$numLangs))
insert<-ggplot(dfTmp,aes(x=as.numeric(Var1),y=Freq))+geom_point()+
scale_y_log10("",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))+
scale_x_continuous("")+ theme_bw()
svg("community_numLangs_hist_insert.svg",width=svgWidth,height=svgHeight)
print(zplot)
#print(plot+scale_x_continuous("")+scale_y_continuous("",labels=comma),
print(insert,
vp=viewport(width=0.6,height=0.6,x=1,y=1,just=c("right","top")))
dev.off()
N <- max(memberships$V2)
if (NUM.CORES>1) {
cl <- makeCluster(NUM.CORES)
registerDoParallel(cl, cores = NUM.CORES)
}
dfCommsShuffled100 <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"),
.combine = rbind) %dopar% {
dfCommsShuffled <- data.frame(num=seq(1, N), size=rep(0, N), numLangs=rep(0, N), majLangCount=rep(0,N), majLang=rep("", N),stringsAsFactors=FALSE)
print(paste0("Run: ",run))
V(gMin5LCCUndirected)$majLangAdjShuffled<-sample(V(gMin5LCCUndirected)$majLangAdj)#Random permutation
for (comm in seq(1,N)) {
size<-length(memberships$V2[memberships$V2==comm])
langs<-V(gMin5LCCUndirected)$majLangAdjShuffled[memberships$V2==comm]
numLangs<-length(table(langs))
majLang <- unique(langs)
majLang <- majLang[which.max(tabulate(match(langs,majLang)))]
majLangCount<-sum(langs==majLang)
dfCommsShuffled[comm, ] <- list(comm, size, numLangs, majLangCount, majLang)
}
write.csv(dfCommsShuffled,paste0("dfCommsShuffled_run_",run,".csv"))
return(dfCommsShuffled)
}
#Collapse dfComms
write.csv(dfCommsShuffled100,"dfCommsShuffled.csv")
dfCommsShuffled<-ddply(dfCommsShuffled100,.(num),summarize,
size=mean(size),sizeSD=sd(size),
numLangs=mean(numLangs),numLangsSD=sd(numLangs),
majLangCount=mean(majLangCount),majLangCountSD=sd(majLangCount),
majLang="skip")
dfCommsShuffled$majLangPercent<-dfCommsShuffled$majLangCount/dfCommsShuffled$size
warnings()
if (NUM.CORES>1) {
stopCluster(cl)
}
svg("community_numLangs_hist_shuffled.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfCommsShuffled,aes(x=numLangs))
plot<-plot+geom_histogram(binwidth=1)
plot<- plot + scale_x_continuous("Number of languages per community (when language labels are randomly shuffled)")
plot<-plot + scale_y_continuous("Frequency",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
summary(dfComms$numLangs)
summary(dfCommsShuffled$numLangs)
length(dfComms$numLangs[dfComms$numLangs==1])
length(dfCommsShuffled$numLangs[dfCommsShuffled$numLangs==1])
dfComms$shuffled<-"Observed data"
dfCommsShuffled$shuffled<-"Language labels shuffled"
dfTmp<-rbind(dfComms[,c("size","numLangs","majLangCount","shuffled")],dfCommsShuffled[,c("size","numLangs","majLangCount","shuffled")])
dfDensity<-densityDataframe(dfTmp,"numLangs","shuffled",n=2048,adjust=6)
dfDensitySum<-densitySummary(dfTmp,"numLangs","shuffled")
pNumLangs<-densityPlot(dfDensity,dfDensitySum,"Number of languages per community")
svg("community_numLangs_density.svg",width=svgWidth,height=svgHeight)
pNumLangs
dev.off()
#difference of means t-test
t.test(x=dfComms$numLangs,y=dfCommsShuffled$numLangs,alternative="two.sided")
# p-value < 2.2e-16
}
#spinglassComm<-spinglass.community(gMin5LCCUndirected, weights=NA)
#write.csv(spinglassComm$membership,"spinglassCommMembership.csv")
modularity(gMin5LCCUndirected,as.factor(V(gMin5LCCUndirected)$majLangAdj))
#[1] 0.6489853 ge2?
#[1] 0.6291782 ge4_ge2-20
#[1] 0.6671592 ge4_ge2-20_inms
###################################################
## Q3: Lang outwardness ##
###################################################
#TODO: Include all languages????
sort(table(V(gMin5LCCUndirected)$majLangAdj))
#langList<-sort(c("in","es","ms","pt","ja","en","ru","de","tr","it","fil","fr","ar","th","ko","nl"))
#gLangs<-induced.subgraph(gMin5LCC,V(gMin5LCC)$majLangAdj %in% langList)
langList<-sort(unique(V(gMin5LCCUndirected)$majLangAdj))#Must be in same order as langSizes!!!
gLangs<-gMin5LCC
sort(table(V(gLangs)$majLangAdj))
#langs <- unique(V(gLangs)$majLangAdj)
langSizes<-table(V(gLangs)$majLangAdj)
totalNodes<-length(V(gLangs)) #Check if this needs $something?
totalEdges<-length(E(gLangs)) #Check if this needs $weight?
#Check distribution of majLangPercentAdj for different languages
library(plyr)
dfMultilingual<-data.frame(lang=V(gMin5LCCUndirected)$majLangAdj,percent=V(gMin5LCCUndirected)$majLangPercentAdj)
dfMultilingual$lang<-as.factor(dfMultilingual$lang)
dfMultiCollapse<-ddply(dfMultilingual,.(lang),function(df) {c(length(df$percent),min(df$percent),max(df$percent),mean(df$percent),median(df$percent),length(df$percent[df$percent!=1]))})
names(dfMultiCollapse)<-c("lang","nodeCount","min","max","mean","median","multilingualCount")
dfMultiCollapse$multiPercent<-dfMultiCollapse$multilingualCount/dfMultiCollapse$nodeCount
largeLangs<-as.character(dfMultiCollapse$lang[dfMultiCollapse$nodeCount>1000])
plot(log(dfMultiCollapse$nodeCount),dfMultiCollapse$multiPercent)
dfMultiCollapse[order(dfMultiCollapse$nodeCount),c("lang","nodeCount","multiPercent")]
dfMultiCollapseSub<-subset(dfMultiCollapse,nodeCount>1000)
with(dfMultiCollapseSub,plot(log(nodeCount),multiPercent))
svg("multilingualism_ge1000.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(subset(dfMultiCollapse,nodeCount>1000),aes(x=nodeCount,y=multiPercent))
plot<-plot+geom_point()
plot<- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
svg("multilingualism.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfMultiCollapse,aes(x=nodeCount,y=multiPercent))
plot<-plot+geom_point()
plot<- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
svg("multilingualism-labels.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfMultiCollapse,aes(x=nodeCount,y=multiPercent,label=lang))
plot<-plot+geom_text()#Use size= here to control size
plot<- plot + scale_x_log10("Language size (number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
svg("multilingualism-labels_and_point.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfMultiCollapse,aes(x=nodeCount,y=multiPercent,label=lang)) + geom_point()
plot<-plot+geom_text(aes(y=multiPercent+0.02))#Use size= here to control size
plot<- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot#+stat_smooth(method="loess")
#method="lm","gam"/library(mgcv) (http://www.inside-r.org/r-doc/mgcv/gam)
dev.off()
#Correlation between size/mutlilingualism?
cor(dfMultiCollapse$nodeCount,dfMultiCollapse$multiPercent)
with(dfMultiCollapse,cor(log(nodeCount),multiPercent))
with(subset(dfMultiCollapse,nodeCount<=1000),cor(log(nodeCount),multiPercent))
with(subset(dfMultiCollapse,nodeCount<=10^5),cor(log(nodeCount),multiPercent))
coef(lm(log(nodeCount) ~ multiPercent, data = dfMultiCollapse))
cor.test(dfMultiCollapse$nodeCount,dfMultiCollapse$multiPercent,alternative="less",conf.level=0.95)
cor.test(log(dfMultiCollapse$nodeCount),dfMultiCollapse$multiPercent,alternative="less",conf.level=0.95)
# method = c("pearson", "kendall", "spearman")
#dfMultiCollapseSub[order(dfMultiCollapseSub$multiPercent),c("lang","nodeCount","multiPercent")]
rm(dfMultiCollapseSub)
dfMultiCollapse[order(dfMultiCollapse$multiPercent),c("lang","nodeCount","multiPercent")]
dfMultiCollapse[order(dfMultiCollapse$nodeCount),c("lang","nodeCount","multiPercent")]
N<-length(langList)
dfOut <- data.frame(lang=rep("", N), nodeCount=rep(0, N), edgeCount=rep(0, N),
inLangEdges=rep(0, N),expInLangEdges=rep(0, N),
stringsAsFactors=FALSE)
i<-0
for (lang in langList) {
#langEdges<-length(
# E(gLangs)[E(gLangs)$source %in%
# V(gLangs)[V(gLangs)$majLangAdj==lang]
# ])
langEdges<-sum(degree(gLangs,V(gLangs)$majLangAdj==lang,mode="out"))
inGroupEdges<-
length(E(induced.subgraph(gLangs,V(gLangs)$majLangAdj==lang)))
#bwGroupEdges<-langEdges-inGroupEdges
expInGroupEdges<-(as.numeric(langSizes[lang])/totalNodes)*langEdges
i<-i+1
dfOut[i, ] <- list(lang,as.numeric(langSizes[lang]),langEdges,inGroupEdges,expInGroupEdges)
}
#dfOut$nodeCount<-as.numeric(dfOut$nodeCount)
#dfOut$edgeCount<-as.numeric(dfOut$edgeCount)
#dfOut$inLangEdges<-as.numeric(dfOut$inLangEdges)
#dfOut$expInLangEdges<-as.numeric(dfOut$expInLangEdges)
dfOut$diff<-dfOut$inLangEdges-dfOut$expInLangEdges
dfOut$fracIn<-dfOut$inLangEdges/dfOut$edgeCount
dfOut$expFrac<-dfOut$expInLangEdges/dfOut$edgeCount
dfOut$nodeFrac<-dfOut$nodeCount/totalNodes
dfOut$pError<-dfOut$diff/dfOut$expInLangEdges
dfOut$diffPercent<-abs(dfOut$diff)/((dfOut$inLangEdges+dfOut$expInLangEdges)/2)
dfOut$logFracIn<-log(dfOut$inLangEdges)/log(dfOut$edgeCount)
dfOut$zscore<-(dfOut$inLangEdges-mean(dfOut$inLangEdges))/sd(dfOut$inLangEdges)
write.csv(dfOut,"dfOut_directed.csv")
with(subset(dfOut,fracIn>0.8),plot(log(nodeCount),fracIn))
source("../tweet-langs-inms.R")
dfTweetLangs<-tweetLangAll()
dfTweetLangs<-subset(dfTweetLangs,lang %in% langList)
totalTweets<-sum(dfTweetLangs$count)
dfTweetLangs$percent<-dfTweetLangs$count/totalTweets
dfOut$tweetCount<-NA
dfOut$tweetPercent<-NA
for (lang in langList) {
dfOut$tweetCount[dfOut$lang==lang]<-dfTweetLangs$count[dfTweetLangs$lang==lang]
dfOut$tweetPercent[dfOut$lang==lang]<-dfTweetLangs$percent[dfTweetLangs$lang==lang]
}
dfOut$tweetExpEdges<-dfOut$tweetPercent*dfOut$edgeCount
dfOut$tweetdiff<-dfOut$inLangEdges-dfOut$tweetExpEdges
#Table \label{tbl:inwardness}
dfSum<-subset(dfOut,nodeCount>1000)[ , c("lang","nodeFrac","fracIn") ]
dfSum$nodeFrac<-round(dfSum$nodeFrac*100,2)
dfSum$fracIn<-round(dfSum$fracIn*100,2)
dfSum[order(dfSum$fracIn, decreasing=TRUE), ]
ggplot(dfSum,aes(label=lang,x=nodeFrac,y=fracIn))+geom_text()
#ident <- function(x) {(x/totalNodes)*totalEdges}
#
plot <- ggplot(dfOut,aes(x=nodeCount,y=fracIn)) + geom_point()
#plot <- plot + stat_function(fun = ident, linetype="dashed")
plot <- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot
#ident <- function(x) {x}
plot <- ggplot(dfOut,aes(x=nodeCount,y=diff)) + geom_point()
#plot <- plot + stat_function(fun = ident, linetype="dashed")
plot <- plot + stat_smooth(method="lm", se=TRUE)
plot <- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot <- plot + scale_y_log10("Difference from expected (log)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot
dfA<-data.frame(lang=rep("", N),var=rep("", N),val=rep(0, N))
dfA$lang<-dfOut$lang
dfA$var<-"fracIn"
dfA$val<-dfOut$fracIn
dfB<-data.frame(lang=rep("", N),var=rep("", N),val=rep(0, N))
dfB$lang<-dfOut$lang
dfB$var<-"fracExp"
dfB$val<-dfOut$expFrac
dfTrans<-rbind(dfA,dfB)
rm(dfA)
rm(dfB)
dfTrans$nodeCount<-dfOut$nodeCount
plot <- ggplot(dfTrans,aes(x=nodeCount,y=val,group=var,color=var)) + geom_point()
plot <- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
#plot <- plot + scale_y_log10("Difference from expected (log)",
# breaks = trans_breaks("log10", function(x) 10^x),
# labels = trans_format("log10", math_format(10^.x)))
plot
#E(gLangs)$sourceLang<-""
#E(gLangs)$targetLang<-""
#edges<-get.edges(gLangs, E(gLangs))
#edgeLangs<-data.frame(source=rep("",length(edges)), target=rep("",length(edges)),stringsAsFactors=FALSE)
#i<-0
#for (edge in edges) {
# s<-edges[edge,1]
# t<-edges[edge,2]
# sLang<-V(gLangs)[s]$majLangAdj
# tLang<-V(gLangs)[t]$majLangAdj
# i<-i+1
# edgeLangs[i, ] <- c(sLang,tLang)
#}
#gCollapsed<-contract.vertices(gLangs, which(langList==V(gLangs)$majLangAdj))
levels<-as.factor(V(gLangs)$majLangAdj)
edges<-get.edges(gLangs, E(gLangs))
E(gLangs)$source<-edges[,1]
E(gLangs)$target<-edges[,2]
gCollapsed<-contract.vertices(gLangs, levels,vertex.attr.comb="first")
try({
gCollapsed<-remove.edge.attribute(gCollapsed, "sourceLang")
gCollapsed<-remove.edge.attribute(gCollapsed, "targetLang")
})
gCollapsed<-simplify(gCollapsed,remove.loops=TRUE,remove.multiple=FALSE)
E(gCollapsed)$weight<-1
gCollapsed<-simplify(gCollapsed,remove.multiple=TRUE,edge.attr.comb=list(weight="sum",source=function(x) sum(!duplicated(x)),target=function(x) sum(!duplicated(x))))
summary(gCollapsed)
V(gCollapsed)$label<-V(gCollapsed)$majLangAdj
V(gCollapsed)$name<-V(gCollapsed)$majLangAdj
V(gCollapsed)$langSize<-0
for (index in V(gCollapsed)) {
l<-V(gCollapsed)[index]$majLangAdj
V(gCollapsed)[index]$langSize<-langSizes[which(langList==l)]
}
V(gCollapsed)$langSizeLog<-log(V(gCollapsed)$langSize)
#New insertion 2013-12-19###################
#(graph without normalization / expected a la Wikipedia article (ultimately not used in published paper, but retained for reference)
E(gCollapsed)$mentionCount<-E(gCollapsed)$weight
E(gCollapsed)$weightLog<-log(E(gCollapsed)$source)
weightLogSD<-sd(E(gCollapsed)$weightLog)
weightLogMean<-mean(E(gCollapsed)$weightLog)
print(paste0("Mean: ",weightLogMean))
print(paste0("95% Mark: ",weightLogMean+(1.96*weightLogSD)))
#Add some columns for use in visualization
#Random bug, igraph doesn't write boolean values to graph output files; so, use numbers
E(gCollapsed)$weightLog_geMean<-ifelse(E(gCollapsed)$weightLog>=weightLogMean,1,0)
E(gCollapsed)$weightLog_ge1SD<-ifelse(E(gCollapsed)$weightLog>=weightLogMean+weightLogSD,1,0)
E(gCollapsed)$weightLog_ge95<-ifelse(E(gCollapsed)$weightLog>=weightLogMean+(1.96*weightLogSD),1,0)
E(gCollapsed)$weightLog_diffMean<-E(gCollapsed)$weightLog-weightLogMean
#Repeat with percent of users in source lang mention users in target lang?
edges<-get.edges(gCollapsed, E(gCollapsed))
E(gCollapsed)$weightPercent<-0
for (edge in E(gCollapsed)) {
s=V(gCollapsed)[edges[[edge,1]]]$name
E(gCollapsed)[edge]$weightPercent<-E(gCollapsed)[edge]$source/langSizes[which(langList==s)]
}
E(gCollapsed)$weight<-E(gCollapsed)$weightLog
write.graph(gCollapsed,"gCollasped_nonnormalized_usercounts.graphml",format="graphml")
#List top edges by weightlog (but give actual weight)
tmp<-get.edgelist(gCollapsed)
tmp<-cbind(as.data.frame(tmp),E(gCollapsed)$weightLog,E(gCollapsed)$source,E(gCollapsed)$weightPercent)
names(tmp)<-c("source","target","weightLog","weight","weightPercent")
#head(tmp[order(tmp$weight,decreasing=TRUE), ],n=25)
head(tmp[order(tmp$weight,decreasing=TRUE), c("source","target","weight","weightPercent")],n=7)
#Filter to ge95 and drop isolates
gTmp<-gCollapsed
gTmp<-delete.edges(gTmp,E(gTmp)[E(gTmp)$weightLog_ge95!=1])
V(gTmp)$degree<-degree(gTmp)
gTmp<-delete.vertices(gTmp,V(gTmp)$degree==0)
#Repeat everything above with English removed?
################End insertion 2013-12-19##################################
#Add missing edges with weight of 0
edgelist<-as.data.frame(get.edgelist(gCollapsed,names=TRUE))
names(edgelist)=c("source","target")
edgelist$exists<-TRUE
N<-length(langList)*(length(langList)-1)
dfEdgesFull<-data.frame(source=rep("",N),target=rep("",N),stringsAsFactors=FALSE)
i<-0
for (source in langList) {
for (target in langList) {
if (target!=source) {
i<-i+1
dfEdgesFull[i, ]<-c(source,target)
}
}
}
dfEdgesFull<-merge(x=dfEdgesFull,y=edgelist,all=TRUE)
missing<-dfEdgesFull[is.na(dfEdgesFull$exists), ]
missing<-data.frame(source=missing$source,target=missing$target)
medges<-c()
for (index in seq(1,length(missing$source))) {
medges<-c(medges,as.character(missing[index, "source"]),
as.character(missing[index, "target"])
)
}
gCollapsed<-add.edges(gCollapsed,medges,attr=list(weight=0))
summary(gCollapsed)
#V(gCollapsed)$outDegree<-degree(gCollapsed,V(gCollapsed),mode="out") #Not out-degree, sum of weight on out edges
V(gCollapsed)$outWeight<-0
E(gCollapsed)$expected<-NA
edges<-get.edges(gCollapsed, E(gCollapsed))
for (edge in seq(1,length(edges)/2)) {
s<-edges[edge,1]
V(gCollapsed)[s]$outWeight<-V(gCollapsed)[s]$outWeight + E(gCollapsed)[edge]$weight
}
for (edge in seq(1,length(edges)/2)) {
s<-edges[edge,1]
t<-edges[edge,2]
sLang<-V(gCollapsed)[s]$majLangAdj
tLang<-V(gCollapsed)[t]$majLangAdj
#TODO: Demoninator needs to be reduced by the size of the source language!!!
#sLangSize<-langSizes[which(langList==sLang)]
#E(gCollapsed)[edge]$expected<-(langSizes[which(langList==tLang)]/(totalNodes-sLangSize)) * V(gCollapsed)[s]$outWeight
E(gCollapsed)[edge]$expected<-(langSizes[which(langList==tLang)]/totalNodes) * V(gCollapsed)[s]$outWeight
}
summary(E(gCollapsed)$weight)
summary(E(gCollapsed)$expected)
E(gCollapsed)$pError<-(E(gCollapsed)$weight-E(gCollapsed)$expected)/E(gCollapsed)$expected
E(gCollapsed)$pErrorZ<-(E(gCollapsed)$pError-mean(E(gCollapsed)$pError,na.rm=TRUE))/sd(E(gCollapsed)$pError,na.rm=TRUE)
E(gCollapsed)$diffPercent<-abs(E(gCollapsed)$weight-E(gCollapsed)$expected)/((E(gCollapsed)$weight+E(gCollapsed)$expected)/2)
E(gCollapsed)$zscore<-(E(gCollapsed)$weight-mean(E(gCollapsed)$weight))/sd(E(gCollapsed)$weight)
summary(E(gCollapsed)$pError)
summary(E(gCollapsed)$zscore)
write.graph(gCollapsed,"gCollapsed.graphml",format="graphml")
gCollapsedLargeLangs<-induced.subgraph(gCollapsed,V(gCollapsed)$majLangAdj %in% largeLangs)
E(gCollapsedLargeLangs)$pErrorZ<-(E(gCollapsedLargeLangs)$pError-mean(E(gCollapsedLargeLangs)$pError,na.rm=TRUE))/sd(E(gCollapsedLargeLangs)$pError,na.rm=TRUE)
E(gCollapsedLargeLangs)$zscore<-(E(gCollapsedLargeLangs)$weight-mean(E(gCollapsedLargeLangs)$weight))/sd(E(gCollapsedLargeLangs)$weight)
write.graph(gCollapsedLargeLangs,"gCollapsedLargeLangs.graphml",format="graphml")
edgeDataset<-function(graph) {
N<-length(E(graph))
dfEdges<-data.frame(source=rep("",N),target=rep("",N),pError=rep(NA,N),zscore=rep(NA,N),pErrorZ=rep(NA,N),stringsAsFactors=FALSE)
edges<-get.edges(graph, E(graph))
for (edge in seq(1,N)) {
s<-edges[edge,1]
t<-edges[edge,2]
sLang<-V(graph)[s]$majLangAdj
tLang<-V(graph)[t]$majLangAdj
pError<-E(graph)[edge]$pError
zscore<-E(graph)[edge]$zscore
pErrorZ<-E(graph)[edge]$pErrorZ
dfEdges[edge, ]<-list(sLang,tLang,pError,zscore,pErrorZ)
}
return(dfEdges)
}
#Table \label{tbl:langlang-more}
#head(dfEdges[order(dfEdges$pError,decreasing=TRUE),c("source","target","pError")],n=10)
dfEdgesLarge<-edgeDataset(gCollapsedLargeLangs)
dfEdgesLarge$pError100<-round(dfEdgesLarge$pError*100)
head(dfEdgesLarge[order(dfEdgesLarge$pError,decreasing=TRUE),c("source","target","pError100")],n=10)
#subset(dfEdges,pErrorZ>=1.96|pErrorZ<=1.96)
dfEdges<-edgeDataset(gCollapsed)#Is this really the same? Shouldn't it be different
dfEdgesSub<-subset(dfEdges,source%in%largeLangs & target%in%largeLangs)
head(dfEdgesSub[order(dfEdgesSub$pError,decreasing=TRUE),c("source","target","pError")],n=10)
rm(dfEdgesSub)
dfEdgesSub<-subset(dfEdges,zscore > 1.96| zscore < -1.96)
dfEdgesSub[order(dfEdgesSub$zscore,decreasing=TRUE),c("source","target","zscore")]
rm(dfEdgesSub)
dfEdges$pErrorZ<-(dfEdges$pError-mean(dfEdges$pError,na.rm=TRUE))/sd(dfEdges$pError,na.rm=TRUE)
dfEdgesSub<-subset(dfEdges,pErrorZ > 1.96 | pErrorZ < -1.96)
head(dfEdgesSub[order(dfEdgesSub$pErrorZ,decreasing=TRUE),c("source","target","pError","pErrorZ")],n=10)
rm(dfEdgesSub)
dfEdgesSub<-subset(dfEdges,source%in%largeLangs & target%in%largeLangs)
dfEdgesSub$pErrorZ<-(dfEdgesSub$pError-mean(dfEdgesSub$pError,na.rm=TRUE))/sd(dfEdgesSub$pError,na.rm=TRUE)
dfEdgesSub<-subset(dfEdgesSub,pErrorZ > 1.96 | pErrorZ < -1.96)
head(dfEdgesSub[order(dfEdgesSub$pErrorZ,decreasing=TRUE),c("source","target","pError","pErrorZ")],n=10)
rm(dfEdgesSub)
#N<-length(largeLangs)*(length(largeLangs)-1)
#dfEdgesFull<-data.frame(source=rep("",N),target=rep("",N),stringsAsFactors=FALSE)
#i<-0
#for (source in largeLangs) {
# for (target in largeLangs) {
# if (target!=source) {
# i<-i+1
# dfEdgesFull[i, ]<-c(source,target)
# }
# }
#}
#
#
#dfEdgesFull<-merge(x=dfEdgesFull,y=dfEdgesLarge,all=TRUE)
#dfEdgesFull[is.na(dfEdgesFull$pError), ]
#
##Any pairs with no edges in either direction? (\label{tbl:langlang-less})
#tmp<-dfEdgesFull[is.na(dfEdgesFull$pError), c("source","target")]
#dis<-data.frame(source=tmp$source,target=tmp$target)
#dis2<-data.frame(source=tmp$target,target=tmp$source)
#reallyDis<-merge(x=dis,y=dis2,all=FALSE)
#reallyDis
#New version of R lists includes edges with weight 0, this results in pError of -1
sum(E(gCollapsedLargeLangs)$weight==0)==sum(dfEdgesLarge$pError==-1)
sum(dfEdgesLarge$pError==-1)
#24 disconnected pairs
dfTmp<-dfEdgesLarge[dfEdgesLarge$pError==-1,c("source","target")]
dfTmp$mutual<-0
for (i in seq(1,length(dfTmp$mutual))) {
s<-dfTmp[i,"source"]
t<-dfTmp[i,"target"]
if (sum(dfTmp[dfTmp$source==t, ]$target==s)==1) {
dfTmp[i,"mutual"]<-1
}
}
missingTable<-function(df) {
str<-paste0(df$source," & ")
for (i in length(df$target)) {
if (df$mutual[i]) {
str<-paste0(str,"\\emphas{",df$target[i],"},")
} else {
str<-paste0(str,df$target[i],",")
}
}
print(str)
return(str)
}
ddply(dfTmp,.(source),missingTable)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment