##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
## Download and examine deleted congress tweets ##
## Data Source: politwoops.sunlightfoundation.com ##
## Analysis: Katherine Ognyanova at www.kateto.net ##
## Visualizations: http://kateto.net/politwoops ##
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
library(RJSONIO)
library(RCurl)
library(plyr)
options(stringsAsFactors = FALSE)
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
# Data:
# pw.df - deleted tweets data frame
# pol.inf - politician info data frame
# pw.dfs - deleted tweets + politician info data frame
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
## Get Politwoops data using package PolitwoopsR ##
install.packages('devtools')
library(devtools)
install_github('kateto/PolitwoopsR')
library(PolitwoopsR)
# Get tweet JSONs:
pw.df <- get_pw_tweets()
dim(pw.df)
colnames(pw.df)
# Get politician data:
pol.inf <- get_pw_pol()
dim(pol.inf)
head(pol.inf)
# Combine tweet and politician data:
pw.dfs <- merge_pw(pw.df, pol.inf)
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
## TWEET URLs: Expand and extract domains ##
# Expand URLs:
pw.dfs$url.decoded <- url_expand(pw.df)
# Extract domains from the URLs:
pw.dfs$url.domain <- url_domain(pw.dfs$url.decoded)
# Any short URLs left in the data after expanding are problematic -
# they may have been shortened multiple times, or they might be
# broken, incorrect, expired, or pointing to sites flagged as unsafe.
problem.urls <- which( !is.na(pw.dfs$url.decoded) & (grepl("//t\\.co", pw.dfs$url.decoded) |
                                                     grepl("/ow\\.ly", pw.dfs$url.decoded) |
                                                     grepl("/bit\\.ly", pw.dfs$url.decoded) |
                                                     grepl("tinyurl", pw.dfs$url.decoded)))
pw.dfs$url.decoded[problem.urls]
# We could remove those, or give them another pass first to extract the double-shortened urls:
for(i in problem.urls) {
  tryCatch({ pw.dfs$url.decoded[i] <- getURLContent(pw.dfs$url.decoded[i], header=T)$header["Location"] },
           error = function(err){ print(paste("ERROR: ", err, "at #", i)); return("Incorrect/broken link") }) }
pw.dfs$url.domain <- url_domain(pw.dfs$url.decoded)
pw.dfs$url.domain <- tolower(pw.dfs$url.domain)
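# After the second pass, any URLs that still point to a shortener can simply be dropped -
# a minimal optional sketch (not part of the original workflow):
still.short <- which(!is.na(pw.dfs$url.decoded) &
                     grepl("//t\\.co|/ow\\.ly|/bit\\.ly|tinyurl", pw.dfs$url.decoded))
pw.dfs$url.decoded[still.short] <- NA
pw.dfs$url.domain[still.short]  <- NA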
detach(package:twitteR)
detach(package:XML)
detach(package:RCurl)
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
## TWEET TEXT: clean, stem, stem complete, top terms ##
# Clean the text:
pw.dfs$cclean <- pw.dfs$content
pw.dfs$cclean <- clean_text(pw.dfs$cclean)
# Detect tweet language (based on the cleaned text):
library("textcat")
languages <- TC_byte_profiles[names(TC_byte_profiles) %in% c("english", "spanish")]
pw.dfs$lang <- textcat(pw.dfs$cclean, p=languages)
pw.dfs$lang[is.na(pw.dfs$lang)] <- "en"
pw.dfs$lang[pw.dfs$lang=="english"] <- "en"
pw.dfs$lang[pw.dfs$lang=="spanish"] <- "es"
count(pw.dfs$lang)
detach(package:textcat)
# Removing stopwords, punctuation, numbers:
library("RWeka")
library("tm")
library("SnowballC")
tw.stop <- c(stopwords('english'), "twitter", "tweets", "tweet", "retweet",
             "tweeting", "account", "rt", "via", "cc", "ht")
pw.dfs$cclean <- removeWords(pw.dfs$cclean, tw.stop)
pw.dfs$cclean <- removePunctuation(pw.dfs$cclean)
pw.dfs$cclean <- removeNumbers(pw.dfs$cclean)
# Stemming:
stemrw <- function(txt){paste(LovinsStemmer(WordTokenizer(txt)), collapse=" ")}
stemrtm <- function(txt){paste(wordStem(WordTokenizer(txt)), collapse=" ")}
stemc <- function(x, d){ (paste(stemCompletion(WordTokenizer(x), dictionary = d), collapse=" ")) }
dict <- WordTokenizer(pw.dfs$cclean)
pw.dfs$cstem <- apply(as.data.frame(pw.dfs$cclean), 1, stemrw)
# Note that the stemmer appears to have problems with terms that end in t -
# e.g. it stems "meet" to "mees", "get" to "ges", etc.
# Those might have to be fixed manually later when doing stem completion.
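# One way to handle those is a small manual patch of the stemmed text before building the corpus.
# A minimal sketch - the replacements below are only illustrative and should be extended
# after inspecting pw.dfs$cstem:
pw.dfs$cstem <- gsub("\\bmees\\b", "meet", pw.dfs$cstem)
pw.dfs$cstem <- gsub("\\bges\\b", "get", pw.dfs$cstem)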
# Create a corpus:
text.corpus <- Corpus(VectorSource(pw.dfs$cstem ))
# Building a Document-Term Matrix:
text.dtm <- TermDocumentMatrix(text.corpus, control = list(minWordLength = 1))
inspect(text.dtm)
# Terms that occur more than 100 times:
findFreqTerms(text.dtm, lowfreq = 100)
# Term frequency:
termFreq <- rowSums(as.matrix(text.dtm))
# which words are associated with "government" and "Obama"?
# Note that words are stemmed:
findAssocs(text.dtm, 'governm', 0.05)
findAssocs(text.dtm, 'obam', 0.05)
# Top 150 terms - with stem completion:
text.mat <- as.matrix(text.dtm)
top.terms <- data.frame(terms = stemCompletion(names(sort(rowSums(text.mat), decreasing=T)[1:150]), dict),
                        freq = sort(rowSums(text.mat), decreasing=T)[1:150], stringsAsFactors=F)
detach(package:RWeka)
detach(package:tm)
detach(package:SnowballC)
# Top term wordcloud:
library(wordcloud)
library(RColorBrewer)
pald <- brewer.pal(8,"Dark2")
wordcloud(top.terms$terms, top.terms$freq, min.freq=100, rot.per=.15, colors=pald)
detach(package:wordcloud)
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
## TWEET HASHTAGS: extract all & by party ##
library("plyr")
library("RWeka")
all.tags <- grep("^#.+", WordTokenizer(tolower(pw.dfs$content)), value=T)
rep.tags <- grep("^#.+", WordTokenizer(tolower(pw.dfs$content[pw.dfs$party=="Rep"])), value=T)
dem.tags <- grep("^#.+", WordTokenizer(tolower(pw.dfs$content[pw.dfs$party=="Dem"])), value=T)
all.tags <- count(all.tags)[order(count(all.tags)[,2], decreasing=T),]
rep.tags <- count(rep.tags)[order(count(rep.tags)[,2], decreasing=T),]
dem.tags <- count(dem.tags)[order(count(dem.tags)[,2], decreasing=T),]
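# Take a quick look at the most frequent hashtags overall and within each party:
head(all.tags, 10)
head(rep.tags, 10)
head(dem.tags, 10)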
detach(package:RWeka)
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
## TWEET SENTIMENT: sentiment and polarity ##
library(tm)
library(tm.plugin.sentiment)
# Note that the tm_tag_score function has been renamed to tm_term_score!
tm_tag_score <- tm_term_score
pw.dfs$tw.polarity <- polarity(text.dtm)
pw.dfs$tw.polarity[is.na(pw.dfs$tw.polarity) | is.nan(pw.dfs$tw.polarity)] <- 0
pol.pos <- pos_refs_per_ref(text.dtm)
pol.neg <- neg_refs_per_ref(text.dtm)
detach(package:tm.plugin.sentiment)
detach(package:tm)
library(qdap)
qdap.polarity <- polarity(pw.dfs$cclean)$all
pw.dfs$word.count <- qdap.polarity$wc
pw.dfs$tw.polarity2 <- qdap.polarity$polarity
pw.dfs$tw.polarity2[is.na(pw.dfs$tw.polarity2) | is.nan(pw.dfs$tw.polarity2)] <- 0
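# Optional sanity check: how closely do the two polarity measures
# (tm.plugin.sentiment vs. qdap) agree?
cor(pw.dfs$tw.polarity, pw.dfs$tw.polarity2)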
detach(package:qdap)
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
## COLLECT ADDITIONAL DATA FROM TWITTER ##
library(twitteR)
# Create an app at http://apps.twitter.com/
# Go to the "Keys and access token" tab and get the key info to fill below:
# (If access_token & access_secret are not included, browser authentication is required.)
auth <- function(){ setup_twitter_oauth(consumer_key="CONSUMER_KEY_HERE",
                                        consumer_secret="CONSUMER_SECRET_HERE")
                                        # access_token="ACCESS_TOKEN_HERE",
                                        # access_secret="ACCESS_SECRET_HERE")
}
auth()
# Twitter usernames from Politwoops deleted tweet data:
polit.user <- unique(pw.dfs$twitter)
polit.id <- unique(pw.dfs$id)
# Get info about the users from the Twitter API:
polit.twinfo <- lookupUsers(polit.user)
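# lookupUsers typically returns only the accounts it could find (suspended or renamed
# handles are dropped), so it may help to check which requested handles are missing:
length(polit.twinfo)
setdiff(tolower(polit.user), tolower(names(polit.twinfo)))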
# The twitteR user class behaves a bit oddly at times, so it is easier to work with a list extracted from it:
tw.fields <- c("id", "name", "statusesCount", "followersCount", "friendsCount", "profileImageUrl",
               "created", "verified", "location", "screenName")
polit.tw <- lapply(polit.twinfo, function(x){ y <- list(); for(i in tw.fields) y[[i]] <- x[[i]]; y })
# Download Twitter user profile images
# (to use in visualizations)
library("png")
library("jpeg")
img.dld <- data.frame( file.url=sapply(polit.tw, function(x){x$profileImageUrl}),
                       user.id=sapply(polit.tw, function(x){x$id}),
                       user.name=names(polit.tw), stringsAsFactors=F )
img.dld$file.url <- sub("_normal","", img.dld$file.url, fixed=T)
img.dld$file.name <- paste0(img.dld$user.name, sub(".*(\\.[^\\.]+$)","\\1", img.dld$file.url))
img.dld$file.name <- gsub("\\.jpeg","\\.jpg", img.dld$file.name)
img.dld <- img.dld[order(img.dld$user.id),]
for (i in 1:nrow(img.dld)) {
  download.file(img.dld$file.url[i], img.dld$file.name[i], mode = 'wb') }
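# The png/jpeg packages can then read the downloaded images back into R for use in plots.
# A minimal sketch (reads the first downloaded file, picking the reader by extension):
img.test <- img.dld$file.name[1]
img.test.mat <- if (grepl("\\.png$", img.test)) readPNG(img.test) else readJPEG(img.test)
dim(img.test.mat) # height x width x channels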
detach(package:jpeg)
detach(package:png)
# Get info about the followers, followees & timeline of each politician:
# Rotate the type of data obtained from twitter (through get.ind$iter),
# and pause for sleep.minutes when the Twitter rate limit is reached
# (this is slow due to rate limits and may take a few days to run).
get.ind <- data.frame(fr=1, fol=1, tw=1, current="fr", iter=0)
sleep.minutes <- 5
while(any(get.ind[,1:3] <= length(polit.tw))) {
  get.ind$iter <- get.ind$iter+1
  tryCatch({
    if(get.ind$fr <= length(polit.tw) & get.ind$iter %% 3==1) {
      get.ind$current <- "fr"
      rorl <- ifelse(polit.tw[[get.ind$fr]]$friendsCount < 5000, 0, 3000) # the API only serves 5000 at a time
      polit.tw[[get.ind$fr]]$friend.ids <- polit.twinfo[[get.ind$fr]]$getFriendIDs(retryOnRateLimit=rorl)
      get.ind$fr <- get.ind$fr+1 }
    if(get.ind$fol <= length(polit.tw) & get.ind$iter %% 3==2) {
      get.ind$current <- "fol"
      rorl <- ifelse(polit.tw[[get.ind$fol]]$followersCount < 5000, 0, 3000) # the API only serves 5000 at a time
      polit.tw[[get.ind$fol]]$follower.ids <- polit.twinfo[[get.ind$fol]]$getFollowerIDs(retryOnRateLimit=rorl)
      get.ind$fol <- get.ind$fol+1 }
    if(get.ind$tw <= length(polit.tw) & get.ind$iter %% 3==0) {
      get.ind$current <- "tw"
      polit.tw[[get.ind$tw]]$tweets <- userTimeline(names(polit.tw)[get.ind$tw], retryOnRateLimit=0)
      get.ind$tw <- get.ind$tw+1 }
  },
  warning = function(warn){ print(paste("Warning at", get.ind$current, "=", get.ind[[get.ind$current]], "and time =", Sys.time()))
                            Sys.sleep(sleep.minutes*60) } ) }
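# Since the loop above can take days to complete, it is worth saving the collected data
# once it finishes (or periodically inside the loop); the file name here is just an example:
saveRDS(polit.tw, "polit_tw.rds")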
detach(package:twitteR)
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##