# (GitHub gist page chrome removed so this file parses as R.)
# Gist description: Analysis of Twitter mentions/retweets network for
# #CHI2014 paper "Global Connectivity and Multilinguals in the Twitter
# Network". #R #igraph
#!/usr/bin/R
# Main analysis for "Global Connectivity and Multilinguals in the Twitter Network"
# paper. See http://www.scotthale.net/pubs/?chi2014 for details.
#
# Author: Scott A. Hale (http://www.scotthale.net/)
# License: GPLv2
# If you use this in support of an academic publication, please cite:
#
# Hale, S. A. (2014) Global Connectivity and Multilinguals in the Twitter Network.
# In Proceedings of the 2014 ACM Annual Conference on Human Factors in Computing Systems,
# ACM (Montreal, Canada).
#
# More details, related code, and the original academic paper using this code
# is available at http://www.scotthale.net/pubs/?chi2014
#
# Requires density_functions.R available at https://gist.github.com/computermacgyver/8416453
library(igraph)
library(ggplot2)
library(scales)
library(gridExtra)
library(plyr)
library(parallel)
library(foreach)
library(doParallel)
source("density_functions.R") #This file available at https://gist.github.com/computermacgyver/8416453
#if non-interactive and --save flag use the following to save the enviornment before quitting on an error
#options(error = quote(q("yes")))
# Parallelism settings: never spin up more workers than there are runs.
NUM.CORES <- 5
NUM.RUNS <- 100
NUM.CORES <- min(NUM.CORES, NUM.RUNS)
#TODO: Consider average time between tweets -- remove lowest quartile as bots (?)
#TODO: Combine Indonesian and Malay together!
# Load the anonymised mention/retweet graph and keep one large weakly
# connected component.  NOTE(review): components[[1]] is assumed to be the
# largest component; decompose.graph() does not document a size ordering --
# confirm for this igraph version.
g<-read.graph("../../hadoop-twitter/graph_ge4_ge2-20_clean-msin_anon.graphml",format="graphml")
components<-decompose.graph(g,mode="weak", min.vertices = 100)
gLCC<-components[[1]]
rm(g)
# Output-device dimensions used by png()/svg() calls throughout the script.
pngWidth<-800
pngHeight<-700
svgWidth<-8
svgHeight<-8
# Distribution of per-user tweet counts in the LCC (original run's output
# kept below for reference).
summary(V(gLCC)$tweetCount)
# summary(V(gLCC)$tweetCount)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1.000 1.000 2.000 3.615 4.000 343.000
#Filter to require at least one *in* edge
V(gLCC)$inDegree<-degree(gLCC, mode = "in")
summary(V(gLCC)$inDegree)
# Keep users with at least one in-edge, a positive adjusted tweet count,
# and at least 4 tweets overall; then take the LCC of what remains.
gMin5<-induced.subgraph(gLCC,which(V(gLCC)$inDegree>0 & V(gLCC)$tweetCountAdj>0 & V(gLCC)$tweetCount>=4))
components<-decompose.graph(gMin5,mode="weak", min.vertices = 100)
gMin5LCC<-components[[1]]
#rm(gLCC)
rm(gMin5)
#clusters<-edge.betweenness.community(gMin5LCC)
#sizes(clusters)
summary(gMin5LCC)
# Collapse parallel edges (summing their attributes) and drop self-loops;
# also build an undirected version for the robustness analyses below.
gMin5LCC<-simplify(gMin5LCC,remove.loops=TRUE,remove.multiple=TRUE,edge.attr.comb="sum")
summary(gMin5LCC)
gMin5LCCUndirected<-as.undirected(gMin5LCC, mode = "collapse")
summary(gMin5LCCUndirected)
# Count/share of users whose adjusted majority-language percentage is
# below 1, i.e. users detected tweeting in more than one language.
summary(V(gMin5LCCUndirected)$majLangPercentAdj)
length(V(gMin5LCCUndirected)$majLangPercentAdj)
length(V(gMin5LCCUndirected)$majLangPercentAdj[V(gMin5LCCUndirected)$majLangPercentAdj==1])
length(V(gMin5LCCUndirected)$majLangPercentAdj[V(gMin5LCCUndirected)$majLangPercentAdj!=1])
length(V(gMin5LCCUndirected)$majLangPercentAdj[V(gMin5LCCUndirected)$majLangPercentAdj!=1])/length(V(gMin5LCCUndirected)$majLangPercentAdj)
##############################
## Summary ##
###############################
# Export the graph plus per-node language/username lists for external
# tools (the Java label-propagation code reads these later).
write.graph(gMin5LCC,"edges.list",format="edgelist")
write(V(gMin5LCC)$majLangAdj,"langclean.txt")
write.csv(V(gMin5LCC)$label,"usernames.txt",row.names=FALSE)#Untested
# Per-node network measures used in the summary statistics below.
V(gMin5LCC)$inDegree<-degree(gMin5LCC, mode = "in")
V(gMin5LCC)$outDegree<-degree(gMin5LCC, mode = "out")
V(gMin5LCC)$constraint<-constraint(gMin5LCC)
#V(gMin5LCC)$betweenness<-betweenness(gMin5LCC)
# Label users: majLangPercentAdj==1 -> monolingual, otherwise multilingual.
V(gMin5LCC)$multilingual[V(gMin5LCC)$majLangPercentAdj!=1]<-"Multilingual"
V(gMin5LCC)$multilingual[V(gMin5LCC)$majLangPercentAdj==1]<-"Monolingual"
#V(gMin5LCCUndirected)$multilingual<-as.factor(V(gMin5LCCUndirected)$multilingual)
table(V(gMin5LCC)$multilingual)
length(V(gMin5LCC)$multilingual)
# Flat per-user table used for the boxplots and significance tests below.
sumStats<-data.frame(multilingual=V(gMin5LCC)$multilingual,
tweetCount=V(gMin5LCC)$tweetCountAdj,
inDegree=V(gMin5LCC)$inDegree,
outDegree=V(gMin5LCC)$outDegree,
majLangCount=V(gMin5LCC)$majLangCountAdj,
constraint=V(gMin5LCC)$constraint#,betweenness=V(gMin5LCC)$betweenness
)
#TODO: Remove outliers?
# Boxplots comparing monolingual vs multilingual users on four measures.
# Each figure goes to its own SVG; every y axis is log10-scaled.
svg("tweetCount.svg",width=4,height=4)
plot <- ggplot(sumStats, aes(x = multilingual, y = tweetCount)) +
  geom_boxplot() +
  scale_y_log10("Number of tweets") +
  scale_x_discrete("") +
  theme_bw()
print(plot)
dev.off()
svg("inDegree.svg",width=4,height=4)
plot <- ggplot(sumStats, aes(x = multilingual, y = inDegree)) +
  geom_boxplot() +
  scale_y_log10("In-degree") +
  scale_x_discrete("") +
  theme_bw()
print(plot)
dev.off()
svg("outDegree.svg",width=4,height=4)
plot <- ggplot(sumStats, aes(x = multilingual, y = outDegree)) +
  geom_boxplot() +
  scale_y_log10("Out-degree") +
  scale_x_discrete("") +
  theme_bw()
print(plot)
dev.off()
svg("majLangCount.svg",width=4,height=4)
plot <- ggplot(sumStats, aes(x = multilingual, y = majLangCount)) +
  geom_boxplot() +
  scale_y_log10("Number of tweets in majority language") +
  scale_x_discrete("") +
  theme_bw()
print(plot)
dev.off()
# Quick interactive look at tweet-count densities by group.
ggplot(sumStats,aes(x=tweetCount,color=multilingual)) + geom_density() + scale_x_log10()
#TODO: Different linetypes for split variable??? (e.g., dashed for mono, solid for multi)
# Log-scale density plots; density.log() is defined in density_functions.R
# (sourced at the top) and returns a list containing $plot.
pTweetCount<-density.log(sumStats,"tweetCount","multilingual",n=2048,adjust=8,title="Number of tweets")
pIndegree<-density.log(sumStats,"inDegree","multilingual",n=2048,adjust=8,title="In-degree/visibility")
pOutdegree<-density.log(sumStats,"outDegree","multilingual",n=2048,adjust=8,title="Out-degree/reach")
pConstraint<-density.log(sumStats,"constraint","multilingual",n=2048,adjust=8,title="Burt's constraint")
#pBetweenness<-density.log(sumStats,"betweenness","multilingual",n=2048,adjust=8,title="Betweenness")
# Extract the legend grob from one panel so a single shared legend can be
# placed under the combined figure.
tmp <- ggplot_gtable(ggplot_build(pTweetCount$plot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
leg <- tmp$grobs[[leg]]
# NOTE(review): gtables store `heights` (plural); `leg$height` appears to
# rely on `$` partial matching -- confirm, or spell out leg$heights.
lheight <- sum(leg$height)
svg("introSummary2.svg",width=12,height=5)
# Three density panels on the top row, shared legend on the bottom row.
# Fix: grid.arrange() was called with ncol= twice (ncol=3,ncol=1), which
# fails when the dots reach arrangeGrob(); the layout intended here is
# 3 columns x 2 rows (heights supplies the two row heights).
grid.arrange(
pTweetCount$plot+theme(legend.position="none")+scale_x_log10("Number of tweets",
breaks = c(1,10,100),
labels = trans_format("log10", math_format(10^.x))),
pOutdegree$plot+theme(legend.position="none")+scale_x_log10("Out-degree/reach",
breaks = c(1,10,100),
labels = trans_format("log10", math_format(10^.x))),
pIndegree$plot+theme(legend.position="none")+scale_x_log10("In-degree/visibility",
breaks = c(1,10,100,1000,10000),
labels = trans_format("log10", math_format(10^.x))),
#Row 2
textGrob(""), leg, ncol=3,nrow=2,heights=unit.c(unit(1, "npc") - lheight, lheight)
)
dev.off()
svg("introSummary.svg",width=12,height=5)
# Same three panels without the axis relabelling.
# Fix: duplicate ncol= argument (ncol=3,ncol=1) replaced with the intended
# 3-column, 2-row layout (plots on top, shared legend below).
grid.arrange(
pTweetCount$plot+theme(legend.position="none"),
pOutdegree$plot+theme(legend.position="none"),
pIndegree$plot+theme(legend.position="none"),
#Row 2
textGrob(""), leg, ncol=3,nrow=2,heights=unit.c(unit(1, "npc") - lheight, lheight)
)
dev.off()
#plot <- ggplot(subset(sumStats,tweetCount>=4),aes(x=tweetCount,y=inDegree,group=multilingual,color=multilingual))
#plot <- plot + geom_point() + scale_y_log10("Number of tweets in majority language")
#plot <- plot + scale_x_log10("") + theme_bw()
#plot <- ggplot(subset(sumStats,multilingual=="Multilingual"),aes(x=majLangPercent)) + geom_hist()
# Mono vs multi comparisons for tweet count, out-degree and in-degree:
# descriptives plus a Welch t-test and the non-parametric Wilcoxon and
# Kruskal-Wallis tests (results print when run interactively).
summary(sumStats$tweetCount[sumStats$multilingual=="Monolingual"])
sd(sumStats$tweetCount[sumStats$multilingual=="Monolingual"])
summary(sumStats$tweetCount[sumStats$multilingual=="Multilingual"])
sd(sumStats$tweetCount[sumStats$multilingual=="Multilingual"])
t.test(sumStats$tweetCount[sumStats$multilingual=="Monolingual"],sumStats$tweetCount[sumStats$multilingual=="Multilingual"])
wilcox.test(sumStats$tweetCount~sumStats$multilingual)
kruskal.test(sumStats$tweetCount~sumStats$multilingual)
summary(sumStats$outDegree[sumStats$multilingual=="Monolingual"])
sd(sumStats$outDegree[sumStats$multilingual=="Monolingual"])
summary(sumStats$outDegree[sumStats$multilingual=="Multilingual"])
sd(sumStats$outDegree[sumStats$multilingual=="Multilingual"])
t.test(sumStats$outDegree[sumStats$multilingual=="Monolingual"],sumStats$outDegree[sumStats$multilingual=="Multilingual"])
wilcox.test(sumStats$outDegree~sumStats$multilingual)
kruskal.test(sumStats$outDegree~sumStats$multilingual)
summary(sumStats$inDegree[sumStats$multilingual=="Monolingual"])
sd(sumStats$inDegree[sumStats$multilingual=="Monolingual"])
summary(sumStats$inDegree[sumStats$multilingual=="Multilingual"])
sd(sumStats$inDegree[sumStats$multilingual=="Multilingual"])
t.test(sumStats$inDegree[sumStats$multilingual=="Monolingual"],sumStats$inDegree[sumStats$multilingual=="Multilingual"])
wilcox.test(sumStats$inDegree~sumStats$multilingual)
kruskal.test(sumStats$inDegree~sumStats$multilingual)
# Burt's constraint descriptives per group.
summary(sumStats$constraint[sumStats$multilingual=="Monolingual"])
summary(sumStats$constraint[sumStats$multilingual=="Multilingual"])
#summary(sumStats$betweenness[sumStats$multilingual=="Monolingual"])
#summary(sumStats$betweenness[sumStats$multilingual=="Multilingual"])
########################
## Node removal -- This analysis is not included in the published paper. See Node removal simple.
########################
#see games.c
#gRand<-rewire(gMin5LCCUndirected,mode="simple",niter=1000)
#OR
#Copy majLangPercentRank across based on degreeRank
#V(gMin5LCCUndirected)$degreeRank<-rank(V(gMin5LCCUndirected)$degree,ties.method="random")
#V(gRand)$degree<-degree(gRand)
#V(gRand)$degreeRank<-rank(V(gRand)$degree,ties.method="random")
#ranks<-V(gMin5LCCUndirected)$majLangPercentRank[order(V(gMin5LCCUndirected)$degreeRank)]
#V(gRand)$majLangPercentRank<-ranks[order(V(gRand)$degreeRank)]
# Same mono/multi labelling as above, applied to the undirected graph.
V(gMin5LCCUndirected)$multilingual[V(gMin5LCCUndirected)$majLangPercentAdj!=1]<-"Multilingual"
V(gMin5LCCUndirected)$multilingual[V(gMin5LCCUndirected)$majLangPercentAdj==1]<-"Monolingual"
#TODO: Consider comparing to attack algorithm -- e.g. remove most central nodes first (by k-cores?)
# Betweenness centrality does not work as it changes as nodes are removed
# Could consider comparing to removing highest degree first -- Did this (I think): multilinguals created less components than highest degree first
# Remove increasing fractions of nodes (per `vals`) under four orderings --
# multilinguals first, monolinguals first, a random permutation, and the
# degree-preserving rewired baseline -- recording the weak-component
# structure after each removal step.
# NOTE(review): reads the globals gMin5LCCUndirected and gRand; both must
# be prepared by the caller (see the foreach loop below).
runNodeRemoval <- function(vals) {
# as many rows & cols as needed; don't know levels yet
GRAPHSIZE<-length(V(gMin5LCCUndirected))
#vals<-seq(0.01,0.99,0.03)#c(.01,.02,.03,.04,.05,.06,.07,.08,.09,.1,.2,.3,.4,.5,.6,.7,.8,.9)
#vals<-seq(0.01,0.1,0.005)
# Fix: FOUR passes below each append length(vals) rows; the frame was
# preallocated for only three and silently grew during the fourth pass.
N <- length(vals)*4
df <- data.frame(val=rep(NA, N), components=rep(NA, N), lccSize=rep(NA, N), avgSize=rep(NA, N), constraint=rep(NA, N), numEdges=rep(NA,N), numNodes=rep(NA,N), method=rep("", N), stringsAsFactors=FALSE)
i<-0
#Multilinguals first
for (x in vals) {
print(x)
y<-x*GRAPHSIZE
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentRank>=y)
#print(summary(tmp))
#clusters<-fastgreedy.community(tmp)
components<-clusters(tmp,mode="weak")
#print(components$no)
m<-max(components$csize)
i<-i+1
# NOTE: c() coerces the row to character; columns are converted back to
# numeric after the runs are combined (see below).
df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "multiFirst")
}
#Monolinguals first
for (x in vals) {
print(x)
y<-(1-x)*GRAPHSIZE
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentRank<=y)
#print(summary(tmp))
#clusters<-fastgreedy.community(tmp)
components<-clusters(tmp,mode="weak")
#print(components$no)
m<-max(components$csize)
i<-i+1
df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "monoFirst")
}
#Randomly permute majLangPercent and repeat the above
V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRank)
for (x in vals) {
print(x)
y<-x*GRAPHSIZE
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$permuted>=y)
#print(summary(tmp))
components<-clusters(tmp,mode="weak")
#print(components$no)
m<-max(components$csize)
i<-i+1
df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "random")
}
#Compare to gRand
for (x in vals) {
print(x)
y<-x*GRAPHSIZE
tmp<-induced.subgraph(gRand,V(gRand)$majLangPercentRank>=y)
#print(summary(tmp))
#clusters<-fastgreedy.community(tmp)
components<-clusters(tmp,mode="weak")
#print(components$no)
m<-max(components$csize)
i<-i+1
df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), ecount(tmp), vcount(tmp), "random-rewire")
}
#High degree first
#V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRank)
#for (x in vals) {
# print(x)
# y<-x*GRAPHSIZE
# tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$degreeRank>=y)
# print(summary(tmp))
# components<-clusters(tmp,mode="weak")
# print(components$no)
# m<-max(components$csize)
# i<-i+1
# df[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(tmp),na.rm = TRUE), "highFirst")
#}
return(df)
}
# Run the node-removal sweep NUM.RUNS times in parallel (rank ties and the
# random permutation differ per run) and rbind the per-run results.
if (NUM.CORES>1) {
cl <- makeCluster(NUM.CORES)
registerDoParallel(cl, cores = NUM.CORES)
}
dfAvg <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"),
.combine = rbind) %dopar% {
print(paste0("Run: ",run))
GRAPHSIZE<-length(V(gMin5LCCUndirected))
V(gMin5LCCUndirected)$degree<-degree(gMin5LCCUndirected, mode = "all")
# Rank users by majority-language share; random tie-breaking gives each
# run a different removal order among tied users.
V(gMin5LCCUndirected)$majLangPercentRank<-rank(V(gMin5LCCUndirected)$majLangPercentAdj,ties.method="random")
# Degree-preserving random graph baseline (Viger-Latapy method).
gRand<-degree.sequence.game(degree(gMin5LCCUndirected),method="vl")
V(gRand)$majLangPercentRank<-V(gMin5LCCUndirected)$majLangPercentRank
df<-runNodeRemoval(seq(0.01,0.99,0.01))
df$edgesRemoved<-ecount(gMin5LCCUndirected)-as.numeric(df$numEdges)
# Per-run checkpoint so a crashed sweep can be recovered from disk.
write.csv(df,paste0("output_df_randomOrder_withCounts_",run,".csv"))
return(df)
}
warnings()
if (NUM.CORES>1) {
stopCluster(cl)
}
# Fraction of edges incident to multilingual users (and, alternatively,
# to users below a 60% majority-language threshold), then node fractions.
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$multilingual=="Monolingual")
1-ecount(tmp)/ecount(gMin5LCCUndirected)
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentAdj>=.6)
1-ecount(tmp)/ecount(gMin5LCCUndirected)
sum(V(gMin5LCCUndirected)$multilingual=="Multilingual")/vcount(gMin5LCCUndirected)
sum(V(gMin5LCCUndirected)$majLangPercentAdj<.6)/vcount(gMin5LCCUndirected)
#dfZoom<-runNodeRemoval(seq(0.01,0.2,0.01))
#write.csv(dfZoom,"output_dfZoom.csv")
#df<-read.csv("output2.csv")
# Rows were filled with c() inside runNodeRemoval and are therefore
# character; convert back to numeric before averaging across runs.
df<-dfAvg
df$val<-as.numeric(df$val)
df$components<-as.numeric(df$components)
df$lccSize<-as.numeric(df$lccSize)
df$avgSize<-as.numeric(df$avgSize)
df$constraint<-as.numeric(df$constraint)
df$numEdges<-as.numeric(df$numEdges)
df$numNodes<-as.numeric(df$numNodes)
df$edgesRemoved<-as.numeric(df$edgesRemoved)
# Average the NUM.RUNS sweeps per (removal fraction, method).
df<-ddply(df,.(val,method),summarize,
components=mean(components),lccSize=mean(lccSize),
avgSize=mean(avgSize),constraint=mean(constraint),
numEdges=mean(numEdges),numNodes=mean(numNodes),edgesRemoved=mean(edgesRemoved))
write.csv(df,"output_df_randomOrder_withCounts_avg100.csv")
# Line charts of the averaged node-removal sweeps, one measure per file.
png("no_components.png",width=pngWidth,height=pngHeight)
plot <- ggplot(df, aes(x = val, y = components, color = method, group = method)) +
  geom_line() +
  scale_color_brewer(type = "qual") +
  scale_x_continuous(labels = percent)
print(plot)
dev.off()
png("lccSize.png",width=pngWidth,height=pngHeight)
plot <- ggplot(df, aes(x = val, y = lccSize, color = method, group = method)) +
  geom_line() +
  scale_color_brewer(type = "qual") +
  scale_x_continuous(labels = percent)
print(plot)
dev.off()
png("avgSize.png",width=pngWidth,height=pngHeight)
plot <- ggplot(df, aes(x = val, y = avgSize, color = method, group = method)) +
  geom_line() +
  scale_color_brewer(type = "qual") +
  scale_x_continuous(labels = percent)
print(plot)
dev.off()
png("constraint.png",width=pngWidth,height=pngHeight)
plot <- ggplot(df, aes(x = val, y = constraint, color = method, group = method)) +
  geom_line()
print(plot)
dev.off()
########################
## Node removal simple -- This is included
########################
# Remove a fixed-size node set under five strategies -- all multilinguals;
# the most-monolingual users; a random subset; all multilinguals on the
# degree-preserving rewired graph; the highest-degree users -- and record
# the resulting weak-component structure (one row per strategy).
# Expects vertex attributes majLangPercentAdj, majLangPercentRank and
# degreeRank on gMin5LCCUndirected, and majLangPercentAdj on gRand (all
# set by the caller; see the foreach loop below).
runNodeRemovalSimple <- function(gMin5LCCUndirected,gRand) {
# One result row for subgraph `sub`: weak-component count, LCC size, mean
# size of the non-LCC components, mean Burt's constraint, edge/node counts.
componentRow <- function(sub, method) {
components<-clusters(sub,mode="weak")
m<-max(components$csize)
list(components$no, m, mean(components$csize[components$csize!=m]), mean(constraint(sub),na.rm = TRUE), ecount(sub), vcount(sub), method)
}
nodesToRemove<-length(V(gMin5LCCUndirected)[V(gMin5LCCUndirected)$majLangPercentAdj<1])
GRAPHSIZE<-length(V(gMin5LCCUndirected))
# Fix: five rows are written below; the frame was preallocated with N=4
# and silently grew on the fifth assignment.
N <- 5
df <- data.frame(components=rep(NA, N), lccSize=rep(NA, N), avgSize=rep(NA, N), constraint=rep(NA, N), numEdges=rep(NA,N), numNodes=rep(NA,N), method=rep("", N), stringsAsFactors=FALSE)
i<-0
print("Multilinguals (all removed)")
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentAdj==1)
i<-i+1
df[i, ] <- componentRow(tmp, "multi")
print("Monolinguals (random subset)")
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangPercentRank<=(GRAPHSIZE-nodesToRemove))
i<-i+1
df[i, ] <- componentRow(tmp, "mono")
#Randomly permute majLangPercent and repeat the above
V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRank)
print("Random subset")
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$permuted>nodesToRemove)
i<-i+1
df[i, ] <- componentRow(tmp, "random")
print("Multilinguals (all, edge rewiring)")
tmp<-induced.subgraph(gRand,V(gRand)$majLangPercentAdj==1)
i<-i+1
df[i, ] <- componentRow(tmp, "multiRewire")
print("Highest degree subset")
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$degreeRank>nodesToRemove)
i<-i+1
df[i, ] <- componentRow(tmp, "highDegree")
return(df)
}
# Run the simple node-removal comparison NUM.RUNS times in parallel; the
# ranks and the rewired baseline are re-drawn on every run.
if (NUM.CORES>1) {
cl <- makeCluster(NUM.CORES)
registerDoParallel(cl, cores = NUM.CORES)
}
dfNodeRemoval <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"),
.combine = rbind) %dopar% {
print(paste0("Run: ",run))
GRAPHSIZE<-length(V(gMin5LCCUndirected))
V(gMin5LCCUndirected)$degree<-degree(gMin5LCCUndirected, mode = "all")
# Reversed rank: small degreeRank = high degree (used by "highDegree").
V(gMin5LCCUndirected)$degreeRank<-GRAPHSIZE-rank(V(gMin5LCCUndirected)$degree,ties.method="random")
V(gMin5LCCUndirected)$majLangPercentRank<-rank(V(gMin5LCCUndirected)$majLangPercentAdj,ties.method="random")
# Degree-preserving random graph baseline (Viger-Latapy method).
gRand<-degree.sequence.game(degree(gMin5LCCUndirected),method="vl")
V(gRand)$majLangPercentRank<-V(gMin5LCCUndirected)$majLangPercentRank
V(gRand)$majLangPercentAdj<-V(gMin5LCCUndirected)$majLangPercentAdj
df<-runNodeRemovalSimple(gMin5LCCUndirected,gRand)
df$edgesRemoved<-ecount(gMin5LCCUndirected)-as.numeric(df$numEdges)
df$run<-run
# Per-run checkpoint so partial results survive a crash.
write.csv(df,paste0("output_dfNodeRemovalSimple_",run,".csv"))
return(df)
}
write.csv(dfNodeRemoval,"dfNodeRemovalSimple.csv")
warnings()
if (NUM.CORES>1) {
stopCluster(cl)
}
# Summarise column `var` of data.frame `df`: mean, SD, mean +/- 1.96*SD,
# max and min, returned as a one-row data.frame with columns named
# varMean, varSD, var95Max, var95Min, varMax, varMin.
# NOTE(review): the "95" columns are mean +/- 1.96*SD of the observations
# (a data interval), not a 95% CI of the mean (no division by sqrt(n)) --
# confirm this is what the error bars are meant to show.
conf95<-function(df,var) {
m<-mean(df[[var]])
s<-sd(df[[var]])
tmp<-data.frame(
m,
s,
m+(1.96*s),
m-(1.96*s),
max(df[[var]]),
min(df[[var]])
)
# Fix: the names<- assignment was wrapped in a pointless eval(); a plain
# replacement-function call does the same thing.
names(tmp)<-c(
paste0(var,"Mean"),paste0(var,"SD"),
paste0(var,"95Max"),paste0(var,"95Min"),
paste0(var,"Max"),paste0(var,"Min")
)
return(tmp)
}
# Per-group summary used with ddply(): one row combining the conf95()
# summaries (mean/SD/95-range/max/min) of lccSize, avgSize and components.
# Fix: the original also built a `tmp` data.frame of means/SDs that was
# never used (dead code) -- removed.
meanSD<-function(df) {
lccSize<-conf95(df,"lccSize")
avgSize<-conf95(df,"avgSize")
components<-conf95(df,"components")
return(cbind(lccSize,avgSize,components))
}
#dfNodeRemoval<-read.csv("dfNodeRemovalSimple.csv")
# Per-method averages across runs (meanSD/conf95 provide the *Mean, *SD
# and *95Max/*95Min columns used by the error bars below).
dfNodeRemovalAvg<-ddply(dfNodeRemoval,.(method),meanSD)
dfNodeRemovalAvg
dfNodeRemovalAvg<-subset(dfNodeRemovalAvg,method!="highDegree")
#bar chart
ggplot(dfNodeRemovalAvg,aes(x=method,y=componentsMean)) + geom_bar(stat="identity",alpha=0.75) + geom_errorbar(aes(ymax=components95Max,ymin=components95Min)) + theme_bw()
ggplot(dfNodeRemovalAvg,aes(x=method,y=lccSizeMean)) + geom_bar(stat="identity",alpha=0.75) + geom_errorbar(aes(ymax=lccSize95Max,ymin=lccSize95Min)) + theme_bw()
ggplot(dfNodeRemovalAvg,aes(x=method,y=avgSizeMean)) + geom_bar(stat="identity",alpha=0.75) + geom_errorbar(aes(ymax=avgSize95Max,ymin=avgSize95Min)) + theme_bw()
#boxplots
dfNodeRemoval<-subset(dfNodeRemoval,method!="highDegree")
# Human-readable x-axis labels; the leading \n staggers alternate labels.
dfNodeRemoval$label<-factor(dfNodeRemoval$method,levels=c("multi","mono","random","multiRewire"),labels=c("Multilinguals","\nMonolinguals","Random","\nMultilinguals\n(edges rewired)"))
dfNodeRemovalAvg$label<-factor(dfNodeRemovalAvg$method,levels=c("multi","mono","random","multiRewire"),labels=c("Multilinguals","\nMonolinguals","Random","\nMultilinguals\n(edges rewired)"))
#xscale<-scale_x_discrete("",
# breaks=c("multi","mono","multiRewire","random"),
# labels=c("Multilinguals","\nMonolinguals","\nMultilinguals\n(Edges rewired)","Random")
#)
xscale<-scale_x_discrete("")
# Boxplots over the runs with the per-method mean overplotted as '+'.
cPlot<-ggplot(dfNodeRemoval,aes(x=label,y=components))+geom_boxplot()+theme_bw()
cPlot<-cPlot+xscale+scale_y_continuous("Number of components",labels=comma)
cPlot<-cPlot+geom_point(data=dfNodeRemovalAvg,aes(x=label,y=componentsMean),shape=3)#3=+, 4=*
lccPlot<-ggplot(dfNodeRemoval,aes(x=label,y=lccSize))+geom_boxplot()+theme_bw()
lccPlot<-lccPlot+xscale+scale_y_continuous("Size of LCC",labels=comma)
lccPlot<-lccPlot+geom_point(data=dfNodeRemovalAvg,aes(x=label,y=lccSizeMean),shape=3)
avgPlot<-ggplot(dfNodeRemoval,aes(x=label,y=avgSize))+geom_boxplot()+theme_bw()
avgPlot<-avgPlot+xscale+scale_y_continuous("Average component size (not including LCC)",labels=comma)
avgPlot<-avgPlot+geom_point(data=dfNodeRemovalAvg,aes(x=label,y=avgSizeMean),shape=3)
svg("nodeRemovalSimple.svg",width=12,height=5)
grid.arrange(lccPlot,cPlot,avgPlot,nrow=1,ncol=3)
dev.off()
#Addl -- compare % left in LCC to having removed all speakers of a given language (dfRemoveLangs)
nodesToRemove<-length(V(gMin5LCCUndirected)[V(gMin5LCCUndirected)$majLangPercentAdj<1])
# Fraction of remaining nodes that fell OUT of the LCC after removal.
dfNodeRemovalAvg$lccPercent<-1-(dfNodeRemovalAvg$lccSizeMean/(length(V(gMin5LCC))-nodesToRemove))
dfNodeRemovalAvg[,c("method","lccPercent")]
# Fix: filter on the language column; `dfRemoveLangs=="en"` compared every
# cell of the frame, yielding a logical matrix rather than a row filter.
# NOTE(review): dfRemoveLangs is built further down this script, so this
# comparison only works when run after that section.
dfRemoveLangs[dfRemoveLangs$lang=="en","lccFrac"]
###############################
## Language analysis - remove nodes from one language at a time
##############################
#\label{tbl:topLangs}
#TOP Langs: Table tbl:topLangs
sort(table(V(gMin5LCCUndirected)$majLangAdj))
# The eight largest languages, each analysed individually below.
langs<-c("en","ja","pt","ms","es","nl","ko","th")
#Avg. tweets per lang
dfTmp<-data.frame(majLangAdj=V(gMin5LCCUndirected)$majLangAdj,tweetCount=V(gMin5LCCUndirected)$tweetCount)
dfTmp<-ddply(dfTmp,.(majLangAdj),summarize,meanTweets=round(mean(tweetCount),2),tweetSD=round(sd(tweetCount),2),tweetMin=min(tweetCount),tweetMax=max(tweetCount),tweetMed=median(tweetCount),sumTweets=sum(tweetCount),numUsers=length(tweetCount))
dfTmp[order(dfTmp$numUsers,decreasing=TRUE),c("majLangAdj","numUsers","meanTweets","tweetSD","tweetMin","tweetMax","tweetMed")]
print(dfTmp[order(dfTmp$numUsers,decreasing=TRUE),c("majLangAdj","numUsers","meanTweets","tweetSD")],row.names=FALSE)
GRAPHSIZE<-length(V(gMin5LCCUndirected))
# Remove up to 30% of the whole graph, 1% at a time.
vals<-seq(0.01,0.3,0.01)
N <- length(vals)*(length(langs)+1)
if (NUM.CORES>1) {
cl <- makeCluster(NUM.CORES)
registerDoParallel(cl, cores = NUM.CORES)
}
# For each language: remove that language's most multilingual users first,
# up to x% of the whole graph (capped at the language's size), recording
# the component structure; plus one random-removal baseline per run.
dfLang100 <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"),
.combine = rbind) %dopar% {
dfLang <- data.frame(val=rep(NA, N), components=rep(NA, N), lccSize=rep(NA, N), avgSize=rep(NA, N), lang=rep("", N),stringsAsFactors=FALSE)
i<-0
print(paste0("Run: ",run))
V(gMin5LCCUndirected)$majLangPercentRank<-rank(V(gMin5LCCUndirected)$majLangPercentAdj,ties.method="random")
# Reversed rank: low value = most multilingual user.
V(gMin5LCCUndirected)$majLangPercentRankRev<-GRAPHSIZE-V(gMin5LCCUndirected)$majLangPercentRank
for (lang in langs) {
ranks<-sort(V(gMin5LCCUndirected)$majLangPercentRankRev[V(gMin5LCCUndirected)$majLangAdj==lang])
for (x in vals) {
print(paste(lang,x))
y<-x*GRAPHSIZE
# Cap the cutoff at the language's size so small languages are not
# over-removed.
y<-ranks[min(y,length(ranks))]
tmp<-induced.subgraph(gMin5LCCUndirected,
V(gMin5LCCUndirected)$majLangAdj!=lang | V(gMin5LCCUndirected)$majLangPercentRankRev>=y)
#print(summary(tmp))
components<-clusters(tmp,mode="weak")
#print(components$no)
m<-max(components$csize)
i<-i+1
# c() coerces the row to character; converted back to numeric later.
dfLang[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), lang)
}
}
#Randomly permute majLangPercentRev and repeat the above
V(gMin5LCCUndirected)$permuted<-sample(V(gMin5LCCUndirected)$majLangPercentRankRev)
for (x in vals) {
print(paste("random",x))
y<-x*GRAPHSIZE
tmp<-induced.subgraph(gMin5LCCUndirected,
V(gMin5LCCUndirected)$permuted>=y)
#print(summary(tmp))
components<-clusters(tmp,mode="weak")
#print(components$no)
m<-max(components$csize)
i<-i+1
dfLang[i, ] <- c(x, components$no, m, mean(components$csize[components$csize!=m]), "random")
}
# Per-run checkpoint.
write.csv(dfLang,paste0("dfLang_run_",run,".csv"))
return(dfLang)
}
# Combined runs: rows were stored as character (c() coercion); convert
# back to numeric, then average across runs per (fraction, language).
dfLang<-dfLang100
dfLang$val<-as.numeric(dfLang$val)
dfLang$components<-as.numeric(dfLang$components)
dfLang$lccSize<-as.numeric(dfLang$lccSize)
dfLang$avgSize<-as.numeric(dfLang$avgSize)
write.csv(dfLang,"dfLangAllRuns.csv")
dfLang<-ddply(dfLang,.(val,lang),summarize,
components=mean(components),lccSize=mean(lccSize),
avgSize=mean(avgSize))
write.csv(dfLang,"dfLang_avg100.csv")
warnings()
if (NUM.CORES>1) {
stopCluster(cl)
}
#scale_color_brewer(type="qual")
# Fix: the random-removal baseline was filtered out before it could be
# renamed -- `langs` does not contain "random", so the subset dropped
# those rows and the rename on the next line never matched anything.
# Keep the baseline alongside the eight languages.
dfLang<-subset(dfLang,lang %in% c(langs,"random"))
dfLang$lang[dfLang$lang=="random"]<-"Random"
# Number of components / LCC size as each language's most multilingual
# users are removed; in-plot text at val=.31 replaces the legend.
svg("lang_no_components.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfLang,aes(x=val,y=components,color=lang,group=lang))
plot<-plot + geom_point() + geom_line(size=1) + scale_color_discrete("Language")#,type="qual")
plot<-plot + scale_x_continuous('% of users removed',labels=percent)
plot<-plot + scale_y_continuous("Number of components",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot<-plot + geom_text(aes(x=.31,y=components,color=lang,label=lang),data=subset(dfLang,val==0.3))
plot
dev.off()
# Languages ranked by component count at the 30% endpoint.
dfEnding<-subset(dfLang,val==.3)
dfEnding[order(dfEnding$components), ]
svg("lang_lccSize.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfLang,aes(x=val,y=lccSize,color=lang,group=lang))
plot<-plot + geom_point() + geom_line(size=1) + scale_color_discrete("Language")#,type="qual")
plot<-plot + scale_x_continuous('% of users removed',labels=percent)
plot<-plot + scale_y_continuous("Size of largest connected component",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16))
plot
dev.off()
#TODO: Look at direct.label
table(dfLang$lang[dfLang$val==.3],dfLang$components[dfLang$val==.3])
#Ending num. components for each language (val==.3)
#en 48,190
#es 1,923
#in 6,864
#ja 385
#ko 184
#ms 8,626
#nl 965
#pt 2,211
#random 67,206
# Burt's constraint on the undirected graph, then scatterplots of each
# measure against the majority-language share.
V(gMin5LCCUndirected)$constraint<-constraint(gMin5LCCUndirected)
dfGraph<-data.frame(
majLangPercentAdj=V(gMin5LCCUndirected)$majLangPercentAdj,
tweetCount=V(gMin5LCCUndirected)$tweetCountAdj,
inDegree=V(gMin5LCCUndirected)$inDegree,
constraint=V(gMin5LCCUndirected)$constraint
)
#More graphs
png("tweetCount_majLangPercent.png",width=pngWidth,height=pngHeight)
plot <- ggplot(dfGraph,aes(x=majLangPercentAdj,y=tweetCount)) + geom_point()
plot
dev.off()
png("inDegree_majLangPercent.png",width=pngWidth,height=pngHeight)
plot <- ggplot(dfGraph,aes(x=majLangPercentAdj,y=inDegree)) + geom_point()
plot
dev.off()
png("constraint_majLangPercent.png",width=pngWidth,height=pngHeight)
plot <- ggplot(dfGraph,aes(x=majLangPercentAdj,y=constraint)) + geom_point()
plot
dev.off()
# Spot-check extreme in-degree accounts (names noted from original run).
V(gMin5LCCUndirected)$label[V(gMin5LCCUndirected)$inDegree>40000]#justinbieber
V(gMin5LCCUndirected)$label[V(gMin5LCCUndirected)$inDegree>10000]#SlGNO
#Remove nodes from lang and calculate fraction of remaining nodes in LCC
# Languages with more than 1000 users in the graph.
langs<-as.data.frame(table(V(gMin5LCCUndirected)$majLangAdj))
langs<-subset(langs,langs$Freq>1000)$Var1
N<-length(langs)
dfRemoveLangs<-data.frame(lang=rep("",N),components=rep(NA,N),lccSize=rep(NA,N),
numNodes=rep(NA,N),num2=rep(NA,N),stringsAsFactors=FALSE)
i<-0
# Remove ALL speakers of each language in turn; record the components.
for (lang in langs) {
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangAdj!=lang)
components<-clusters(tmp,mode="weak")
m<-max(components$csize)
i<-i+1
dfRemoveLangs[i, ] <- list(lang, components$no, m, length(V(tmp)), sum(components$csize[components$csize!=m]))
}
# Sanity check: every node is either in the LCC or a smaller component.
dfRemoveLangs$numNodes==dfRemoveLangs$num2+dfRemoveLangs$lccSize
# Fraction of remaining nodes NOT in the largest connected component.
dfRemoveLangs$lccFrac<-(dfRemoveLangs$numNodes-dfRemoveLangs$lccSize)/dfRemoveLangs$numNodes
svg("lang_removed.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfRemoveLangs,aes(x=lang,y=lccFrac))
plot<-plot + geom_bar(stat="identity")
plot<-plot + scale_y_continuous('% of nodes not in largest-connected component',labels=percent)
plot<-plot + scale_x_discrete("Language removed")
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
# Remove Malay AND Indonesian together (closely related languages; see
# the TODO at the top) and compute the non-LCC fraction of the remainder.
tmp<-induced.subgraph(gMin5LCCUndirected,V(gMin5LCCUndirected)$majLangAdj!="ms" & V(gMin5LCCUndirected)$majLangAdj!="in")
components<-clusters(tmp,mode="weak")
m<-max(components$csize)
(length(V(tmp))-m)/length(V(tmp))
###################################################
## Q1: Is language homophilous? ##
###################################################
#take largest *strongly* connected component?
#labelPropComm<-label.propagation.community(gMin5LCCUndirected)
#write.csv(labelPropComm$membership,"labelPropCommMembership.csv")
commsFile<-"iter72memberships_smart.txt" #This file comes from the label propagation code in Java
if (!is.na(commsFile)){
# memberships$V2 holds one community id per node, in usernames.txt order.
memberships<-read.csv(commsFile,header=FALSE,sep=" ")
#Begin Untested code
tmp<-read.csv("usernames.txt",header=FALSE)
sum(tmp!=V(gMin5LCCUndirected)$label)#This should be 0, else the lists are malaligned!
#End untested code
modularity(gMin5LCCUndirected,memberships$V2)
#[1] 0.7471879
#in-ms: 0.8054294
# Per-community summary: size, number of distinct languages, dominant
# language and its user count.
N <- max(memberships$V2)
dfComms <- data.frame(num=seq(1, N), size=rep(0, N), numLangs=rep(0, N), majLangCount=rep(0,N), majLang=rep("", N),stringsAsFactors=FALSE)
for (comm in seq(1,N)) {
size<-length(memberships$V2[memberships$V2==comm])
langs<-V(gMin5LCCUndirected)$majLangAdj[memberships$V2==comm]
numLangs<-length(table(langs))
# Modal language of the community (first mode wins ties).
majLang <- unique(langs)
majLang <- majLang[which.max(tabulate(match(langs,majLang)))]
majLangCount<-sum(langs==majLang)
dfComms[comm, ] <- list(comm, size, numLangs, majLangCount, majLang)
}
write.csv(dfComms,"dfCommunities.csv")
#dfComms<-read.csv("community_info_iter36.csv")
#dfComms$label<-dfComms$label+1
# Fix: majLangPercent is selected by the head() call below, but was only
# created after that line, which raised "undefined columns selected".
# Compute it up front instead.
dfComms$majLangPercent<-dfComms$majLangCount/dfComms$size
sum(dfComms$numLangs==1)
sum(dfComms$numLangs==1)/length(dfComms$numLangs) #A large number of components have one language only
sum(dfComms$size[dfComms$numLangs==1])/sum(dfComms$size) #But these components represent a small number of users
#Five large components have 61% of users, #\label{tbl:topCommunities}
sum(dfComms$size[dfComms$size>10^4])/sum(dfComms$size)
dfComms[dfComms$size>10^4, ]
top<-dfComms[dfComms$size>10^4, ]
# For the table, express the dominant-language share as a rounded percent.
top$majLangPercent<-round((top$majLangCount/top$size)*100,1)
top[order(top$size,decreasing=TRUE), c("majLang","majLangPercent","numLangs","size")]
head(dfComms[order(dfComms$size,decreasing=TRUE), c("majLang","majLangPercent","numLangs","size")],n=25)
sort(table(dfComms$majLang))
#Is 48 the total number of langs in the network?
length(unique(V(gMin5LCCUndirected)$majLangAdj))
summary(dfComms)
with(dfComms,plot(log(size),majLangPercent))
max(dfComms$size)
# NOTE(review): dfComms has no `label` column unless read back from the
# commented-out csv above; on the freshly built frame this returns NULL.
dfComms$label[dfComms$size==max(dfComms$size)]
#label 16 is quite large
#Figure: community size (log x) vs. share of members in the dominant language
svg("communities_size_percent.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=size,y=majLangPercent,color=majLang))
plot<-plot+geom_point()
plot<- plot + scale_x_log10("Cluster size",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified in dominant language",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
#Same figure, language codes as text labels instead of points
svg("communities_size_percent_labels.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=size,y=majLangPercent,label=majLang))
plot<-plot+geom_text()
plot<- plot + scale_x_log10("Cluster size",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified in dominant language",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
#Same figure, points for all communities plus offset labels for the largest ones
svg("communities_size_percent_labels_partial.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=size,y=majLangPercent,color=majLang))
plot<-plot+geom_point()
plot<-plot+geom_text(data=subset(dfComms,size>10^4),aes(x=size,y=majLangPercent+0.025,label=majLang))
plot<- plot + scale_x_log10("Cluster size",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified in dominant language",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
dfComms[dfComms$size>10^4, ]
#hist(log(dfComms$size))
#Figure: histogram of community sizes (log-binned x axis)
svg("community_size.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=size))
plot<-plot+geom_histogram(binwidth=0.15)
plot<- plot + scale_x_log10("Cluster size",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Frequency",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
#Figure: histogram of the number of languages per community
svg("community_numLangs_hist.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfComms,aes(x=numLangs))
plot<-plot+geom_histogram(binwidth=1)
plot<- plot + scale_x_continuous("Number of languages per cluster")
#plot<-plot + scale_y_continuous("Percent of users in communities with this number of languages or fewer (cdf)",labels=percent)
plot<-plot + scale_y_continuous("Frequency",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
#Zoomed version restricted to communities with at most 5 languages
svg("community_numLangs_hist_zoom.svg",width=svgWidth,height=svgHeight)
zplot<-ggplot(subset(dfComms,numLangs<=5),aes(x=as.factor(numLangs)))
zplot<-zplot+geom_histogram(binwidth=1)
zplot<-zplot + scale_x_discrete("Number of languages per cluster")
zplot<-zplot + scale_y_continuous("Frequency",labels=comma)
zplot<-zplot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
zplot
dev.off()
#dfDensity<-densityDataframe(dfComms,"numLangs",NA,n=2048,adjust=8)
#dplot <- ggplot(dfDensity,aes(x=x10,y=y)) + geom_path()
#Combined figure: zoomed histogram with a small log-frequency scatter inset
dfTmp<-as.data.frame(table(dfComms$numLangs))
insert<-ggplot(dfTmp,aes(x=as.numeric(Var1),y=Freq))+geom_point()+
scale_y_log10("",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))+
scale_x_continuous("")+ theme_bw()
svg("community_numLangs_hist_insert.svg",width=svgWidth,height=svgHeight)
print(zplot)
#print(plot+scale_x_continuous("")+scale_y_continuous("",labels=comma),
#NOTE(review): viewport() comes from the grid package (loaded via gridExtra) --
#confirm grid is attached when running this section standalone
print(insert,
vp=viewport(width=0.6,height=0.6,x=1,y=1,just=c("right","top")))
dev.off()
#Null model: repeat the per-community language summary NUM.RUNS times with
#language labels randomly shuffled across vertices (parallelized over NUM.CORES)
N <- max(memberships$V2)
if (NUM.CORES>1) {
cl <- makeCluster(NUM.CORES)
registerDoParallel(cl, cores = NUM.CORES)
}
dfCommsShuffled100 <- foreach(run = 1:NUM.RUNS, .packages = c("igraph"),
.combine = rbind) %dopar% {
dfCommsShuffled <- data.frame(num=seq(1, N), size=rep(0, N), numLangs=rep(0, N), majLangCount=rep(0,N), majLang=rep("", N),stringsAsFactors=FALSE)
print(paste0("Run: ",run))
#Shuffle is applied to the worker's copy of the graph; the master copy is untouched
V(gMin5LCCUndirected)$majLangAdjShuffled<-sample(V(gMin5LCCUndirected)$majLangAdj)#Random permutation
for (comm in seq(1,N)) {
size<-length(memberships$V2[memberships$V2==comm])
langs<-V(gMin5LCCUndirected)$majLangAdjShuffled[memberships$V2==comm]
numLangs<-length(table(langs))
majLang <- unique(langs)
majLang <- majLang[which.max(tabulate(match(langs,majLang)))]
majLangCount<-sum(langs==majLang)
dfCommsShuffled[comm, ] <- list(comm, size, numLangs, majLangCount, majLang)
}
write.csv(dfCommsShuffled,paste0("dfCommsShuffled_run_",run,".csv"))
return(dfCommsShuffled)
}
#Collapse dfComms
write.csv(dfCommsShuffled100,"dfCommsShuffled.csv")
#Average the NUM.RUNS shuffled runs per community (mean and SD of each metric)
dfCommsShuffled<-ddply(dfCommsShuffled100,.(num),summarize,
size=mean(size),sizeSD=sd(size),
numLangs=mean(numLangs),numLangsSD=sd(numLangs),
majLangCount=mean(majLangCount),majLangCountSD=sd(majLangCount),
majLang="skip")
dfCommsShuffled$majLangPercent<-dfCommsShuffled$majLangCount/dfCommsShuffled$size
warnings()
if (NUM.CORES>1) {
stopCluster(cl)
}
#Figure: histogram of languages per community under the shuffled null model
svg("community_numLangs_hist_shuffled.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfCommsShuffled,aes(x=numLangs))
plot<-plot+geom_histogram(binwidth=1)
plot<- plot + scale_x_continuous("Number of languages per community (when language labels are randomly shuffled)")
plot<-plot + scale_y_continuous("Frequency",labels=comma)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
#Compare observed vs. shuffled distributions of languages per community
summary(dfComms$numLangs)
summary(dfCommsShuffled$numLangs)
length(dfComms$numLangs[dfComms$numLangs==1])
length(dfCommsShuffled$numLangs[dfCommsShuffled$numLangs==1])
dfComms$shuffled<-"Observed data"
dfCommsShuffled$shuffled<-"Language labels shuffled"
dfTmp<-rbind(dfComms[,c("size","numLangs","majLangCount","shuffled")],dfCommsShuffled[,c("size","numLangs","majLangCount","shuffled")])
dfDensity<-densityDataframe(dfTmp,"numLangs","shuffled",n=2048,adjust=6)
dfDensitySum<-densitySummary(dfTmp,"numLangs","shuffled")
pNumLangs<-densityPlot(dfDensity,dfDensitySum,"Number of languages per community")
svg("community_numLangs_density.svg",width=svgWidth,height=svgHeight)
pNumLangs
dev.off()
#difference of means t-test
t.test(x=dfComms$numLangs,y=dfCommsShuffled$numLangs,alternative="two.sided")
# p-value < 2.2e-16
}#end if (!is.na(commsFile))
#spinglassComm<-spinglass.community(gMin5LCCUndirected, weights=NA)
#write.csv(spinglassComm$membership,"spinglassCommMembership.csv")
#Modularity when each language is treated as its own community
modularity(gMin5LCCUndirected,as.factor(V(gMin5LCCUndirected)$majLangAdj))
#[1] 0.6489853 ge2?
#[1] 0.6291782 ge4_ge2-20
#[1] 0.6671592 ge4_ge2-20_inms
###################################################
## Q3: Lang outwardness ##
###################################################
#TODO: Include all languages????
sort(table(V(gMin5LCCUndirected)$majLangAdj))
#langList<-sort(c("in","es","ms","pt","ja","en","ru","de","tr","it","fil","fr","ar","th","ko","nl"))
#gLangs<-induced.subgraph(gMin5LCC,V(gMin5LCC)$majLangAdj %in% langList)
langList<-sort(unique(V(gMin5LCCUndirected)$majLangAdj))#Must be in same order as langSizes!!!
gLangs<-gMin5LCC
sort(table(V(gLangs)$majLangAdj))
#langs <- unique(V(gLangs)$majLangAdj)
#Per-language node counts and overall totals (used for expected-edge baselines)
langSizes<-table(V(gLangs)$majLangAdj)
totalNodes<-length(V(gLangs)) #Check if this needs $something?
totalEdges<-length(E(gLangs)) #Check if this needs $weight?
#Check distribution of majLangPercentAdj for different languages
library(plyr) #Already attached at top of file; harmless re-attach
dfMultilingual<-data.frame(lang=V(gMin5LCCUndirected)$majLangAdj,percent=V(gMin5LCCUndirected)$majLangPercentAdj)
dfMultilingual$lang<-as.factor(dfMultilingual$lang)
#Per-language summary of majLangPercentAdj; a user counts as "multilingual"
#here when majLangPercentAdj != 1
dfMultiCollapse<-ddply(dfMultilingual,.(lang),function(df) {c(length(df$percent),min(df$percent),max(df$percent),mean(df$percent),median(df$percent),length(df$percent[df$percent!=1]))})
names(dfMultiCollapse)<-c("lang","nodeCount","min","max","mean","median","multilingualCount")
dfMultiCollapse$multiPercent<-dfMultiCollapse$multilingualCount/dfMultiCollapse$nodeCount
#Languages with more than 1000 users, used repeatedly below
largeLangs<-as.character(dfMultiCollapse$lang[dfMultiCollapse$nodeCount>1000])
plot(log(dfMultiCollapse$nodeCount),dfMultiCollapse$multiPercent)
dfMultiCollapse[order(dfMultiCollapse$nodeCount),c("lang","nodeCount","multiPercent")]
dfMultiCollapseSub<-subset(dfMultiCollapse,nodeCount>1000)
with(dfMultiCollapseSub,plot(log(nodeCount),multiPercent))
#Figure: multilingualism vs. language size, languages with >1000 users only
#(NOTE(review): axis-label typo "multiligual" appears in all four figures; left
#unchanged here because it is a runtime string baked into published output)
svg("multilingualism_ge1000.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(subset(dfMultiCollapse,nodeCount>1000),aes(x=nodeCount,y=multiPercent))
plot<-plot+geom_point()
plot<- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
#Same figure for all languages
svg("multilingualism.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfMultiCollapse,aes(x=nodeCount,y=multiPercent))
plot<-plot+geom_point()
plot<- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
#Same figure with language codes as text labels
svg("multilingualism-labels.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfMultiCollapse,aes(x=nodeCount,y=multiPercent,label=lang))
plot<-plot+geom_text()#Use size= here to control size
plot<- plot + scale_x_log10("Language size (number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot
dev.off()
#Same figure with both points and vertically offset labels
svg("multilingualism-labels_and_point.svg",width=svgWidth,height=svgHeight)
plot<-ggplot(dfMultiCollapse,aes(x=nodeCount,y=multiPercent,label=lang)) + geom_point()
plot<-plot+geom_text(aes(y=multiPercent+0.02))#Use size= here to control size
plot<- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot<-plot + scale_y_continuous("Percentage of users classified as multiligual",labels=percent)
plot<-plot + theme_bw() +
theme(legend.title=element_text(size=18),legend.text=element_text(size=16),
axis.title.x=element_text(size=18),axis.text.x=element_text(size=16),
axis.title.y=element_text(size=18),axis.text.y=element_text(size=16),legend.position="none")
plot#+stat_smooth(method="loess")
#method="lm","gam"/library(mgcv) (http://www.inside-r.org/r-doc/mgcv/gam)
dev.off()
#Correlation between size/mutlilingualism?
cor(dfMultiCollapse$nodeCount,dfMultiCollapse$multiPercent)
with(dfMultiCollapse,cor(log(nodeCount),multiPercent))
with(subset(dfMultiCollapse,nodeCount<=1000),cor(log(nodeCount),multiPercent))
with(subset(dfMultiCollapse,nodeCount<=10^5),cor(log(nodeCount),multiPercent))
coef(lm(log(nodeCount) ~ multiPercent, data = dfMultiCollapse))
#One-sided tests: is multilingualism negatively correlated with language size?
cor.test(dfMultiCollapse$nodeCount,dfMultiCollapse$multiPercent,alternative="less",conf.level=0.95)
cor.test(log(dfMultiCollapse$nodeCount),dfMultiCollapse$multiPercent,alternative="less",conf.level=0.95)
# method = c("pearson", "kendall", "spearman")
#dfMultiCollapseSub[order(dfMultiCollapseSub$multiPercent),c("lang","nodeCount","multiPercent")]
rm(dfMultiCollapseSub)
dfMultiCollapse[order(dfMultiCollapse$multiPercent),c("lang","nodeCount","multiPercent")]
dfMultiCollapse[order(dfMultiCollapse$nodeCount),c("lang","nodeCount","multiPercent")]
#Per-language edge analysis: for each language, total out-edges from its users,
#in-language edges, and the expected in-language edges if mention targets were
#chosen in proportion to language size (random-mixing baseline)
N<-length(langList)
dfOut <- data.frame(lang=rep("", N), nodeCount=rep(0, N), edgeCount=rep(0, N),
inLangEdges=rep(0, N),expInLangEdges=rep(0, N),
stringsAsFactors=FALSE)
i<-0
for (lang in langList) {
#langEdges<-length(
#	E(gLangs)[E(gLangs)$source %in%
#	V(gLangs)[V(gLangs)$majLangAdj==lang]
#	])
#Total out-degree over all vertices classified as this language
langEdges<-sum(degree(gLangs,V(gLangs)$majLangAdj==lang,mode="out"))
#Edges where both endpoints are in this language
inGroupEdges<-
length(E(induced.subgraph(gLangs,V(gLangs)$majLangAdj==lang)))
#bwGroupEdges<-langEdges-inGroupEdges
expInGroupEdges<-(as.numeric(langSizes[lang])/totalNodes)*langEdges
i<-i+1
dfOut[i, ] <- list(lang,as.numeric(langSizes[lang]),langEdges,inGroupEdges,expInGroupEdges)
}
#dfOut$nodeCount<-as.numeric(dfOut$nodeCount)
#dfOut$edgeCount<-as.numeric(dfOut$edgeCount)
#dfOut$inLangEdges<-as.numeric(dfOut$inLangEdges)
#dfOut$expInLangEdges<-as.numeric(dfOut$expInLangEdges)
#Derived measures of in-language edge concentration
dfOut$diff<-dfOut$inLangEdges-dfOut$expInLangEdges
dfOut$fracIn<-dfOut$inLangEdges/dfOut$edgeCount
dfOut$expFrac<-dfOut$expInLangEdges/dfOut$edgeCount
dfOut$nodeFrac<-dfOut$nodeCount/totalNodes
#Relative error vs. expected, and symmetric percent difference
dfOut$pError<-dfOut$diff/dfOut$expInLangEdges
dfOut$diffPercent<-abs(dfOut$diff)/((dfOut$inLangEdges+dfOut$expInLangEdges)/2)
dfOut$logFracIn<-log(dfOut$inLangEdges)/log(dfOut$edgeCount)
dfOut$zscore<-(dfOut$inLangEdges-mean(dfOut$inLangEdges))/sd(dfOut$inLangEdges)
write.csv(dfOut,"dfOut_directed.csv")
with(subset(dfOut,fracIn>0.8),plot(log(nodeCount),fracIn))
#Merge in per-language tweet counts (external helper script) and compute a
#tweet-volume-based expected-edges baseline as an alternative to node counts
source("../tweet-langs-inms.R")
dfTweetLangs<-tweetLangAll()
dfTweetLangs<-subset(dfTweetLangs,lang %in% langList)
totalTweets<-sum(dfTweetLangs$count)
dfTweetLangs$percent<-dfTweetLangs$count/totalTweets
dfOut$tweetCount<-NA
dfOut$tweetPercent<-NA
for (lang in langList) {
dfOut$tweetCount[dfOut$lang==lang]<-dfTweetLangs$count[dfTweetLangs$lang==lang]
dfOut$tweetPercent[dfOut$lang==lang]<-dfTweetLangs$percent[dfTweetLangs$lang==lang]
}
dfOut$tweetExpEdges<-dfOut$tweetPercent*dfOut$edgeCount
dfOut$tweetdiff<-dfOut$inLangEdges-dfOut$tweetExpEdges
#Table \label{tbl:inwardness}
#Inwardness table for large languages: percent of nodes vs. percent of in-language edges
dfSum<-subset(dfOut,nodeCount>1000)[ , c("lang","nodeFrac","fracIn") ]
dfSum$nodeFrac<-round(dfSum$nodeFrac*100,2)
dfSum$fracIn<-round(dfSum$fracIn*100,2)
dfSum[order(dfSum$fracIn, decreasing=TRUE), ]
ggplot(dfSum,aes(label=lang,x=nodeFrac,y=fracIn))+geom_text()
#ident <- function(x) {(x/totalNodes)*totalEdges}
#
#Exploratory plot: in-language fraction vs. language size
plot <- ggplot(dfOut,aes(x=nodeCount,y=fracIn)) + geom_point()
#plot <- plot + stat_function(fun = ident, linetype="dashed")
plot <- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot
#ident <- function(x) {x}
#Exploratory plot: difference from expected in-language edges vs. size, lm fit
plot <- ggplot(dfOut,aes(x=nodeCount,y=diff)) + geom_point()
#plot <- plot + stat_function(fun = ident, linetype="dashed")
plot <- plot + stat_smooth(method="lm", se=TRUE)
plot <- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot <- plot + scale_y_log10("Difference from expected (log)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
plot
#Long-format frame stacking observed (fracIn) and expected (fracExp) per language
dfA<-data.frame(lang=rep("", N),var=rep("", N),val=rep(0, N))
dfA$lang<-dfOut$lang
dfA$var<-"fracIn"
dfA$val<-dfOut$fracIn
dfB<-data.frame(lang=rep("", N),var=rep("", N),val=rep(0, N))
dfB$lang<-dfOut$lang
dfB$var<-"fracExp"
dfB$val<-dfOut$expFrac
dfTrans<-rbind(dfA,dfB)
rm(dfA)
rm(dfB)
#nodeCount recycles: each language appears twice (fracIn row and fracExp row)
dfTrans$nodeCount<-dfOut$nodeCount
plot <- ggplot(dfTrans,aes(x=nodeCount,y=val,group=var,color=var)) + geom_point()
plot <- plot + scale_x_log10("Language size (log of number of users)",
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
#plot <- plot + scale_y_log10("Difference from expected (log)",
#	breaks = trans_breaks("log10", function(x) 10^x),
#	labels = trans_format("log10", math_format(10^.x)))
plot
#E(gLangs)$sourceLang<-""
#E(gLangs)$targetLang<-""
#edges<-get.edges(gLangs, E(gLangs))
#edgeLangs<-data.frame(source=rep("",length(edges)), target=rep("",length(edges)),stringsAsFactors=FALSE)
#i<-0
#for (edge in edges) {
# s<-edges[edge,1]
# t<-edges[edge,2]
# sLang<-V(gLangs)[s]$majLangAdj
# tLang<-V(gLangs)[t]$majLangAdj
# i<-i+1
# edgeLangs[i, ] <- c(sLang,tLang)
#}
#gCollapsed<-contract.vertices(gLangs, which(langList==V(gLangs)$majLangAdj))
#Collapse the user-level graph into a language-language graph: one vertex per
#language (contract.vertices coerces the per-vertex factor to integer group ids)
levels<-as.factor(V(gLangs)$majLangAdj)
#Record original endpoint ids as edge attributes so, after simplify() below,
#"source"/"target" can count distinct original users per collapsed edge
edges<-get.edges(gLangs, E(gLangs))
E(gLangs)$source<-edges[,1]
E(gLangs)$target<-edges[,2]
gCollapsed<-contract.vertices(gLangs, levels,vertex.attr.comb="first")
try({
gCollapsed<-remove.edge.attribute(gCollapsed, "sourceLang")
gCollapsed<-remove.edge.attribute(gCollapsed, "targetLang")
})
#Drop self-loops first, then merge multi-edges: weight = number of mention edges,
#source/target = number of distinct original source/target users on the edge
gCollapsed<-simplify(gCollapsed,remove.loops=TRUE,remove.multiple=FALSE)
E(gCollapsed)$weight<-1
gCollapsed<-simplify(gCollapsed,remove.multiple=TRUE,edge.attr.comb=list(weight="sum",source=function(x) sum(!duplicated(x)),target=function(x) sum(!duplicated(x))))
summary(gCollapsed)
V(gCollapsed)$label<-V(gCollapsed)$majLangAdj
V(gCollapsed)$name<-V(gCollapsed)$majLangAdj
#Annotate each language vertex with the number of users in that language
V(gCollapsed)$langSize<-0
for (index in V(gCollapsed)) {
l<-V(gCollapsed)[index]$majLangAdj
V(gCollapsed)[index]$langSize<-langSizes[which(langList==l)]
}
V(gCollapsed)$langSizeLog<-log(V(gCollapsed)$langSize)
#New insertion 2013-12-19###################
#(graph without normalization / expected a la Wikipedia article (ultimately not used in published paper, but retained for reference)
E(gCollapsed)$mentionCount<-E(gCollapsed)$weight
#NOTE(review): weightLog is log of $source (distinct source users per collapsed
#edge, from the simplify() above), not of $weight (mention count) -- confirm
#this is intentional
E(gCollapsed)$weightLog<-log(E(gCollapsed)$source)
weightLogSD<-sd(E(gCollapsed)$weightLog)
weightLogMean<-mean(E(gCollapsed)$weightLog)
print(paste0("Mean: ",weightLogMean))
print(paste0("95% Mark: ",weightLogMean+(1.96*weightLogSD)))
#Add some columns for use in visualization
#Random bug, igraph doesn't write boolean values to graph output files; so, use numbers
E(gCollapsed)$weightLog_geMean<-ifelse(E(gCollapsed)$weightLog>=weightLogMean,1,0)
E(gCollapsed)$weightLog_ge1SD<-ifelse(E(gCollapsed)$weightLog>=weightLogMean+weightLogSD,1,0)
E(gCollapsed)$weightLog_ge95<-ifelse(E(gCollapsed)$weightLog>=weightLogMean+(1.96*weightLogSD),1,0)
E(gCollapsed)$weightLog_diffMean<-E(gCollapsed)$weightLog-weightLogMean
#Repeat with percent of users in source lang mention users in target lang?
edges<-get.edges(gCollapsed, E(gCollapsed))
E(gCollapsed)$weightPercent<-0
#Per edge: fraction of the source language's users mentioning the target language
for (edge in E(gCollapsed)) {
s=V(gCollapsed)[edges[[edge,1]]]$name
E(gCollapsed)[edge]$weightPercent<-E(gCollapsed)[edge]$source/langSizes[which(langList==s)]
}
E(gCollapsed)$weight<-E(gCollapsed)$weightLog
write.graph(gCollapsed,"gCollasped_nonnormalized_usercounts.graphml",format="graphml")
#List top edges by weightlog (but give actual weight)
tmp<-get.edgelist(gCollapsed)
tmp<-cbind(as.data.frame(tmp),E(gCollapsed)$weightLog,E(gCollapsed)$source,E(gCollapsed)$weightPercent)
names(tmp)<-c("source","target","weightLog","weight","weightPercent")
#head(tmp[order(tmp$weight,decreasing=TRUE), ],n=25)
head(tmp[order(tmp$weight,decreasing=TRUE), c("source","target","weight","weightPercent")],n=7)
#Filter to ge95 and drop isolates
gTmp<-gCollapsed
gTmp<-delete.edges(gTmp,E(gTmp)[E(gTmp)$weightLog_ge95!=1])
V(gTmp)$degree<-degree(gTmp)
gTmp<-delete.vertices(gTmp,V(gTmp)$degree==0)
#Repeat everything above with English removed?
################End insertion 2013-12-19##################################
#Repeat everything above with English removed?
################End insertion 2013-12-19##################################
#Add missing edges with weight of 0
edgelist<-as.data.frame(get.edgelist(gCollapsed,names=TRUE))
names(edgelist)=c("source","target")
edgelist$exists<-TRUE
N<-length(langList)*(length(langList)-1)
dfEdgesFull<-data.frame(source=rep("",N),target=rep("",N),stringsAsFactors=FALSE)
i<-0
for (source in langList) {
for (target in langList) {
if (target!=source) {
i<-i+1
dfEdgesFull[i, ]<-c(source,target)
}
}
}
dfEdgesFull<-merge(x=dfEdgesFull,y=edgelist,all=TRUE)
missing<-dfEdgesFull[is.na(dfEdgesFull$exists), ]
missing<-data.frame(source=missing$source,target=missing$target)
medges<-c()
for (index in seq(1,length(missing$source))) {
medges<-c(medges,as.character(missing[index, "source"]),
as.character(missing[index, "target"])
)
}
gCollapsed<-add.edges(gCollapsed,medges,attr=list(weight=0))
summary(gCollapsed)
#V(gCollapsed)$outDegree<-degree(gCollapsed,V(gCollapsed),mode="out") #Not out-degree, sum of weight on out edges
#Per-vertex total outgoing weight, then per-edge expected weight under random
#mixing: targets chosen in proportion to target-language size
V(gCollapsed)$outWeight<-0
E(gCollapsed)$expected<-NA
edges<-get.edges(gCollapsed, E(gCollapsed))
#edges is a 2-column matrix, so length(edges)/2 == number of edges (== nrow(edges))
for (edge in seq(1,length(edges)/2)) {
s<-edges[edge,1]
V(gCollapsed)[s]$outWeight<-V(gCollapsed)[s]$outWeight + E(gCollapsed)[edge]$weight
}
for (edge in seq(1,length(edges)/2)) {
s<-edges[edge,1]
t<-edges[edge,2]
sLang<-V(gCollapsed)[s]$majLangAdj
tLang<-V(gCollapsed)[t]$majLangAdj
#TODO: Demoninator needs to be reduced by the size of the source language!!!
#sLangSize<-langSizes[which(langList==sLang)]
#E(gCollapsed)[edge]$expected<-(langSizes[which(langList==tLang)]/(totalNodes-sLangSize)) * V(gCollapsed)[s]$outWeight
E(gCollapsed)[edge]$expected<-(langSizes[which(langList==tLang)]/totalNodes) * V(gCollapsed)[s]$outWeight
}
summary(E(gCollapsed)$weight)
summary(E(gCollapsed)$expected)
#Deviation measures: relative error vs. expected and z-scores over all edges
E(gCollapsed)$pError<-(E(gCollapsed)$weight-E(gCollapsed)$expected)/E(gCollapsed)$expected
E(gCollapsed)$pErrorZ<-(E(gCollapsed)$pError-mean(E(gCollapsed)$pError,na.rm=TRUE))/sd(E(gCollapsed)$pError,na.rm=TRUE)
E(gCollapsed)$diffPercent<-abs(E(gCollapsed)$weight-E(gCollapsed)$expected)/((E(gCollapsed)$weight+E(gCollapsed)$expected)/2)
E(gCollapsed)$zscore<-(E(gCollapsed)$weight-mean(E(gCollapsed)$weight))/sd(E(gCollapsed)$weight)
summary(E(gCollapsed)$pError)
summary(E(gCollapsed)$zscore)
write.graph(gCollapsed,"gCollapsed.graphml",format="graphml")
#Subgraph restricted to languages with >1000 users; recompute z-scores within it
gCollapsedLargeLangs<-induced.subgraph(gCollapsed,V(gCollapsed)$majLangAdj %in% largeLangs)
E(gCollapsedLargeLangs)$pErrorZ<-(E(gCollapsedLargeLangs)$pError-mean(E(gCollapsedLargeLangs)$pError,na.rm=TRUE))/sd(E(gCollapsedLargeLangs)$pError,na.rm=TRUE)
E(gCollapsedLargeLangs)$zscore<-(E(gCollapsedLargeLangs)$weight-mean(E(gCollapsedLargeLangs)$weight))/sd(E(gCollapsedLargeLangs)$weight)
write.graph(gCollapsedLargeLangs,"gCollapsedLargeLangs.graphml",format="graphml")
edgeDataset<-function(graph) {
  #Flatten a language-language igraph into a data frame with one row per edge:
  #the endpoint languages plus the pError/zscore/pErrorZ edge attributes,
  #in igraph edge order (same contract as the original row-by-row loop).
  endpoint.ids<-get.edges(graph, E(graph))
  data.frame(source=V(graph)[endpoint.ids[,1]]$majLangAdj,
             target=V(graph)[endpoint.ids[,2]]$majLangAdj,
             pError=E(graph)$pError,
             zscore=E(graph)$zscore,
             pErrorZ=E(graph)$pErrorZ,
             stringsAsFactors=FALSE)
}
#Table \label{tbl:langlang-more}
#head(dfEdges[order(dfEdges$pError,decreasing=TRUE),c("source","target","pError")],n=10)
#Edge-level tables: which language pairs deviate most from the random-mixing baseline
dfEdgesLarge<-edgeDataset(gCollapsedLargeLangs)
dfEdgesLarge$pError100<-round(dfEdgesLarge$pError*100)
head(dfEdgesLarge[order(dfEdgesLarge$pError,decreasing=TRUE),c("source","target","pError100")],n=10)
#subset(dfEdges,pErrorZ>=1.96|pErrorZ<=1.96)
dfEdges<-edgeDataset(gCollapsed)#Is this really the same? Shouldn't it be different
dfEdgesSub<-subset(dfEdges,source%in%largeLangs & target%in%largeLangs)
head(dfEdgesSub[order(dfEdgesSub$pError,decreasing=TRUE),c("source","target","pError")],n=10)
rm(dfEdgesSub)
#Edges more than ~2 SD from the mean weight
dfEdgesSub<-subset(dfEdges,zscore > 1.96| zscore < -1.96)
dfEdgesSub[order(dfEdgesSub$zscore,decreasing=TRUE),c("source","target","zscore")]
rm(dfEdgesSub)
#Recompute pErrorZ over the full edge list (rather than the graph attribute)
dfEdges$pErrorZ<-(dfEdges$pError-mean(dfEdges$pError,na.rm=TRUE))/sd(dfEdges$pError,na.rm=TRUE)
dfEdgesSub<-subset(dfEdges,pErrorZ > 1.96 | pErrorZ < -1.96)
head(dfEdgesSub[order(dfEdgesSub$pErrorZ,decreasing=TRUE),c("source","target","pError","pErrorZ")],n=10)
rm(dfEdgesSub)
dfEdgesSub<-subset(dfEdges,source%in%largeLangs & target%in%largeLangs)
dfEdgesSub$pErrorZ<-(dfEdgesSub$pError-mean(dfEdgesSub$pError,na.rm=TRUE))/sd(dfEdgesSub$pError,na.rm=TRUE)
dfEdgesSub<-subset(dfEdgesSub,pErrorZ > 1.96 | pErrorZ < -1.96)
head(dfEdgesSub[order(dfEdgesSub$pErrorZ,decreasing=TRUE),c("source","target","pError","pErrorZ")],n=10)
rm(dfEdgesSub)
#N<-length(largeLangs)*(length(largeLangs)-1)
#dfEdgesFull<-data.frame(source=rep("",N),target=rep("",N),stringsAsFactors=FALSE)
#i<-0
#for (source in largeLangs) {
# for (target in largeLangs) {
# if (target!=source) {
# i<-i+1
# dfEdgesFull[i, ]<-c(source,target)
# }
# }
#}
#
#
#dfEdgesFull<-merge(x=dfEdgesFull,y=dfEdgesLarge,all=TRUE)
#dfEdgesFull[is.na(dfEdgesFull$pError), ]
#
##Any pairs with no edges in either direction? (\label{tbl:langlang-less})
#tmp<-dfEdgesFull[is.na(dfEdgesFull$pError), c("source","target")]
#dis<-data.frame(source=tmp$source,target=tmp$target)
#dis2<-data.frame(source=tmp$target,target=tmp$source)
#reallyDis<-merge(x=dis,y=dis2,all=FALSE)
#reallyDis
#New version of R lists includes edges with weight 0, this results in pError of -1
sum(E(gCollapsedLargeLangs)$weight==0)==sum(dfEdgesLarge$pError==-1)
sum(dfEdgesLarge$pError==-1)
#24 disconnected pairs
#Mark pairs where the reverse direction is also disconnected ("mutual")
dfTmp<-dfEdgesLarge[dfEdgesLarge$pError==-1,c("source","target")]
dfTmp$mutual<-0
#NOTE(review): seq(1,length(...)) yields c(1,0) when dfTmp has zero rows; use
#seq_len()/seq_along() if an empty disconnected set is possible
for (i in seq(1,length(dfTmp$mutual))) {
s<-dfTmp[i,"source"]
t<-dfTmp[i,"target"]
if (sum(dfTmp[dfTmp$source==t, ]$target==s)==1) {
dfTmp[i,"mutual"]<-1
}
}
missingTable<-function(df) {
  #Format one LaTeX table row "<source> & t1,t2,..." listing the target
  #languages this source language has no edge to; targets that are ALSO
  #disconnected in the reverse direction (mutual==1) are wrapped in \emphas{}.
  #df is one ddply group (single source language) with columns source, target,
  #mutual. Returns (and prints) the single formatted string.
  str<-paste0(df$source[1]," & ") #source is constant within a ddply group
  #BUGFIX: the original `for (i in length(df$target))` visited only the single
  #index length(df$target), silently dropping every target but the last.
  for (i in seq_along(df$target)) {
    if (df$mutual[i]) {
      str<-paste0(str,"\\emphas{",df$target[i],"},")
    } else {
      str<-paste0(str,df$target[i],",")
    }
  }
  print(str)
  return(str)
}
ddply(dfTmp,.(source),missingTable)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.