Skip to content

Instantly share code, notes, and snippets.

@teos0009
Created July 10, 2016 10:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save teos0009/3986bfdfe455a0756734d57b822c05b8 to your computer and use it in GitHub Desktop.
Save teos0009/3986bfdfe455a0756734d57b822c05b8 to your computer and use it in GitHub Desktop.
library(tm)
library(RXKCD)
library(XML)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(rJava)
library(RWeka)
library(Snowball)
library(stringr)
library(igraph)
# Load the raw export. Empty strings and the literals "NA"/"NULL" are read as
# missing values, and text columns are kept as character (not factor).
ap.df <- read.csv(
  "polyforce21.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE, # spelled out in full; no partial argument matching
  na.strings = c("NA", "", "NULL")
)
# attach() is deliberately not used: every later reference goes through
# ap.df / ap.sub1 / native explicitly, and attaching would only put the column
# names on the search path where they can mask other objects.
names(ap.df)
summary(ap.df)
# Keep columns 1, 2 and 4 (tweet text, author and date per the original note).
ap.sub1 <- ap.df[, c(1, 2, 4)]
names(ap.sub1)
head(ap.sub1$Title)
#== select data of interest only ====
# Single definition of the selection pattern so the three greps below cannot
# drift apart. The pattern text is unchanged from the original script.
# NOTE(review): the short alternatives (sp|np|nyp|tp|rp|poly) only require a
# word boundary AFTER the match, so they also match the tail of longer words
# (e.g. the "tp" in "http:"). Confirm whether a leading \\b was intended.
poly_pattern <- "(SingaporePoly|NgeeAnnNP|temasekpoly|Nanyang Polytechnic|Singapore Polytechnic|Ngee Ann Polytechnic|Temasek Polytechnic|Republic Polytechnic|sp|np|nyp|tp|rp|poly)((?:\\b)+)"

# Positions of the matching titles (printed for inspection only).
grep(poly_pattern, ap.sub1$Title, ignore.case = TRUE, value = FALSE)

# Matching titles as a plain character vector.
local <- grep(poly_pattern, ap.sub1$Title, ignore.case = TRUE, value = TRUE)
summary(local)
names(local)
head(local)

# Matching rows with all kept columns; this is the working set used below.
native <- ap.sub1[
  grep(poly_pattern, ap.sub1$Title, ignore.case = TRUE, value = FALSE),
]
summary(native)
names(native)
head(native)

# value = FALSE returns positions within native$Title ...
grep("RT @temasekpoly", native$Title, ignore.case = TRUE, value = FALSE)
# ... value = TRUE returns the matching text itself.
grep("RT @temasekpoly", native$Title, ignore.case = TRUE, value = TRUE)

#= select interested rows =
# Case-sensitive match on one specific headline, taken from the full subset.
selPorsche <- ap.sub1[
  grep("Porsche offers internships to Ngee Ann Polytechnic", ap.sub1$Title),
]
names(selPorsche)
head(selPorsche)
#===== who retweets ========
summary(native)
head(native)
grep("@STcom", native$Title, ignore.case = TRUE, value = TRUE)
native$Title

## Approach adapted from
## https://sites.google.com/site/miningtwitter/questions/user-tweets/who-retweet
# A retweet marker ("RT" or "via") followed by one or more @handles.
rt_regex <- "(RT|via)((?:\\b\\W*@\\w+)+)"

# Display the titles that look like retweets.
grep(rt_regex, native$Title, ignore.case = TRUE, value = TRUE)

# Positions, within native$Title, of the titles that look like retweets.
rt_patterns <- grep(rt_regex, native$Title, ignore.case = TRUE)
rt_patterns

# rt_patterns holds positions in `native`, so it is only valid as an index
# into `native`. The original script also used it to index ap.sub1$Title,
# which picks unrelated rows of the larger data frame -- that is why the two
# heads differed and why "RT @temasekpoly" gave zero hits there. Those
# mis-indexed calls have been removed.
native$Title[rt_patterns]
summary(native$Title[rt_patterns])
head(native$Title[rt_patterns])
grep("RT @temasekpoly", native$Title, ignore.case = TRUE, value = TRUE)
#====== process nodes and edges ===
# One list slot per retweet-like title; each slot receives zero or more names.
who_retweet <- vector("list", length(rt_patterns))
who_post <- vector("list", length(rt_patterns))

# seq_along() (rather than 1:length()) so that an empty rt_patterns simply
# skips the loop instead of iterating over c(1, 0) and indexing out of range.
for (i in seq_along(rt_patterns)) {
  idx <- rt_patterns[i]
  # Tweet text and its author, both taken from the same row of `native`.
  twit <- native$Title[[idx]]
  author <- native$Author[[idx]]
  # Every "RT @x" / "via @x" fragment in the tweet.
  # NOTE(review): this extraction is case-sensitive while rt_patterns was
  # selected with ignore.case = TRUE, so a title matched only by lower-case
  # "rt"/"Via" contributes no edge here. Confirm that this is intended.
  poster <- str_extract_all(twit, "(RT|via)((?:\\b\\W*@\\w+)+)")
  # Drop ':' characters from the extracted fragments.
  poster <- gsub(":", "", unlist(poster))
  # Name of the retweeted user: strip the leading marker.
  who_post[[i]] <- gsub("(RT @|via @)", "", poster, ignore.case = TRUE)
  # Name of the retweeting user, repeated once per extracted fragment so the
  # two lists stay aligned element-for-element.
  who_retweet[[i]] <- rep(author, length(poster))
}

# Flatten to two parallel vectors (retweeter, original poster).
who_post <- unlist(who_post)
who_retweet <- unlist(who_retweet)
#=== gen igraph ========
# Two-column edge matrix: column 1 = retweeting user, column 2 = retweeted user.
retweeter_poster <- cbind(who_retweet, who_post)
# head(retweeter_poster)
# write.csv(retweeter_poster, file = "RTnS.csv")

# Build the graph from the edge list.
rt_graph <- graph.edgelist(retweeter_poster)
# write.graph(rt_graph, "tweetedge.txt", format = c("edgelist"))
# write.graph(rt_graph, "tweetedge.pajek", format = c("pajek"))

# Vertex names, used as plot labels.
ver_labs <- get.vertex.attribute(rt_graph, "name", index = V(rt_graph))
# Vertex degrees, used to scale vertex and label size in the plot.
ver_deg <- degree(rt_graph)
# Show the vertices with degree above 10.
V(rt_graph)[ver_deg > 10]

# Find a diameter path and mark its edges and vertices in white.
dia <- get.diameter(rt_graph)
E(rt_graph, path = dia)$color <- "white"
E(rt_graph, path = dia)$width <- 10
V(rt_graph)[dia]$label.color <- "white"
V(rt_graph)[dia]$color <- "white"

# Layout. Alternatives tried by the author:
# glay <- layout.fruchterman.reingold(rt_graph)
# glay <- layout.kamada.kawai(rt_graph) # very slow
# Fix: the area was computed from vcount(g), but no object `g` exists in this
# script; the graph being laid out is rt_graph.
glay <- layout.fruchterman.reingold(
  rt_graph,
  niter = 2000,
  area = vcount(rt_graph)^5
)
#=== plot graph ===============
# Everything between pdf() and dev.off() is drawn into the PDF file.
pdf("TEST retweet net.pdf")

# Dark background with thin margins.
par(bg = "gray15", mar = c(1, 1, 1, 1))

# Semi-transparent colours for the labels and the edges.
label_colour <- hsv(h = 0, s = 0, v = .95, alpha = 0.5)
edge_colour <- hsv(h = .95, s = 1, v = .7, alpha = 0.5)

plot(
  rt_graph,
  layout = glay,
  vertex.color = "gray25",
  vertex.size = ver_deg / 100,
  vertex.label = ver_labs, # use NA instead of ver_labs to hide the labels
  vertex.label.family = "sans",
  vertex.shape = "none",
  vertex.label.color = label_colour,
  vertex.label.cex = ver_deg / 50, # label size scales with vertex degree
  edge.arrow.size = 0.8,
  edge.arrow.width = 0.5,
  edge.width = 0.1,
  edge.color = edge_colour
)

# Heading drawn on top of the plot.
title(
  "\nTweets with 'satisfied parameters': Who retweets whom",
  cex.main = 1,
  col.main = "gray95"
)

# Close the PDF device so the file is written out.
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment