Created
March 29, 2015 05:12
-
-
Save kateto/7da13df064fffaa3b8ab to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv## | |
## Download and examine deleted congress tweets ## | |
## Data Source: politwoops.sunlightfoundation.com ## | |
## Analysis: Katherine Ognyanova at www.kateto.net ## | |
## Visualizations: http://kateto.net/politwoops ## | |
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv## | |
# Packages used throughout the script.
library(RJSONIO)
library(RCurl)
library(plyr)

# Keep character columns as character in every data frame built below.
options(stringsAsFactors = FALSE)

##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
# Data:
# pw.df   - deleted tweets data frame
# pol.inf - politician info data frame
# pw.dfs  - deleted tweets + politician info data frame
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##

## Get Politwoops data using package PolitwoopsR ##

# Install the helper packages only when they are missing, so that re-running
# the script does not re-download and reinstall them every time.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)
if (!requireNamespace("PolitwoopsR", quietly = TRUE)) {
  install_github("kateto/PolitwoopsR")
}
library(PolitwoopsR)

# Get tweet JSONs:
pw.df <- get_pw_tweets()
dim(pw.df)
colnames(pw.df)

# Get politician data:
pol.inf <- get_pw_pol()
dim(pol.inf)
head(pol.inf)

# Combine tweet and politician data:
pw.dfs <- merge_pw(pw.df, pol.inf)
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
## TWEET URLs: Expand and extract domains ##

# Expand URLs:
# NOTE(review): url_expand() is given pw.df while its result is stored in
# pw.dfs - this presumes merge_pw() preserves the row order of pw.df; confirm.
pw.dfs$url.decoded <- url_expand(pw.df)

# Extract domains from the URLs:
pw.dfs$url.domain <- url_domain(pw.dfs$url.decoded)

# Any short URLs left in the data after expanding are problematic -
# they may have been shortened multiple times, or they might be
# broken, incorrect, expired, or pointing to sites flagged as unsafe.
problem.urls <- which(!is.na(pw.dfs$url.decoded) &
                        (grepl("//t\\.co", pw.dfs$url.decoded) |
                           grepl("/ow\\.ly", pw.dfs$url.decoded) |
                           grepl("/bit\\.ly", pw.dfs$url.decoded) |
                           grepl("tinyurl", pw.dfs$url.decoded)))
pw.dfs$url.decoded[problem.urls]

# We could remove those, or give them another pass first to extract the
# double-shortened urls. The assignment wraps the whole tryCatch() so that
# the value returned by the error handler is actually stored; previously the
# handler's result was discarded and failed URLs were left unchanged.
for (i in problem.urls) {
  pw.dfs$url.decoded[i] <- tryCatch(
    getURLContent(pw.dfs$url.decoded[i], header = TRUE)$header["Location"],
    error = function(err) {
      print(paste("ERROR: ", conditionMessage(err), "at #", i))
      "Incorrect/broken link"
    }
  )
}

# Recompute the domains from the updated URLs and normalize their case.
pw.dfs$url.domain <- url_domain(pw.dfs$url.decoded)
pw.dfs$url.domain <- tolower(pw.dfs$url.domain)

# Detach the networking packages. Entries on the search path are named
# "package:<name>" (not "library:<name>"), and detach() fails for anything
# that is not attached, so only detach what is really there.
for (pkg in c("package:twitteR", "package:XML", "package:RCurl")) {
  if (pkg %in% search()) {
    detach(pkg, character.only = TRUE)
  }
}
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
## TWEET TEXT: clean, stem, stem complete, top terms ##

# Clean the text:
# This has to happen before language detection, because textcat() below
# reads pw.dfs$cclean (the original order used the column before creating it).
pw.dfs$cclean <- clean_text(pw.dfs$content)

# Detect tweet language (restricted to the English and Spanish profiles):
library("textcat")
languages <- TC_byte_profiles[names(TC_byte_profiles) %in% c("english", "spanish")]
pw.dfs$lang <- textcat(pw.dfs$cclean, p = languages)

# Recode to two-letter codes; tweets with no detected language count as "en".
pw.dfs$lang[is.na(pw.dfs$lang)] <- "en"
pw.dfs$lang[pw.dfs$lang == "english"] <- "en"
pw.dfs$lang[pw.dfs$lang == "spanish"] <- "es"
count(pw.dfs$lang)
detach(package:textcat)
# Removing stopwords, punctuation, numbers:
library("RWeka")
library("tm")
library("SnowballC")

# English stopwords plus Twitter-specific filler terms.
tw.stop <- c(
  stopwords('english'),
  "twitter", "tweets", "tweet", "retweet", "tweeting",
  "account", "rt", "via", "cc", "ht"
)

# Applied innermost-first: stopwords, then punctuation, then numbers.
pw.dfs$cclean <- removeNumbers(removePunctuation(removeWords(pw.dfs$cclean, tw.stop)))

# Stemming:
# Each helper tokenizes its input with WordTokenizer() and pastes the
# processed tokens back together into one space-separated string.

# Stem with LovinsStemmer().
stemrw <- function(txt) {
  tokens <- WordTokenizer(txt)
  paste(LovinsStemmer(tokens), collapse = " ")
}

# Stem with wordStem().
stemrtm <- function(txt) {
  tokens <- WordTokenizer(txt)
  paste(wordStem(tokens), collapse = " ")
}

# Complete the stems in x using the dictionary d.
stemc <- function(x, d) {
  completed <- stemCompletion(WordTokenizer(x), dictionary = d)
  paste(completed, collapse = " ")
}

# Dictionary of unstemmed tokens, used later for stem completion.
dict <- WordTokenizer(pw.dfs$cclean)

# One stemmed string per tweet.
pw.dfs$cstem <- vapply(pw.dfs$cclean, stemrw, character(1), USE.NAMES = FALSE)

# Note that the stemmer appears to have problems with terms that end in t -
# e.g. it stems "meet" to "mees", "get" to "ges", etc.
# Those might have to be fixed manually later when doing stem completion.
# Create a corpus:
text.corpus <- Corpus(VectorSource(pw.dfs$cstem))

# Building a Term-Document Matrix (terms in rows, documents in columns).
# `wordLengths` is the tm control option for the accepted term length range;
# `minWordLength` is not a recognized option, so with it the default lower
# bound (3 characters) would silently apply instead of the intended 1.
text.dtm <- TermDocumentMatrix(text.corpus,
                               control = list(wordLengths = c(1, Inf)))
inspect(text.dtm)

# Terms that occur at least 100 times:
findFreqTerms(text.dtm, lowfreq = 100)

# Term frequency (the dense matrix is built once and reused below):
text.mat <- as.matrix(text.dtm)
termFreq <- rowSums(text.mat)

# Which words are associated with "government" and "Obama"?
# Note that words are stemmed:
findAssocs(text.dtm, 'governm', 0.05)
findAssocs(text.dtm, 'obam', 0.05)

# Top 150 terms - with stem completion.
# head() returns fewer rows when there are fewer than 150 terms, whereas
# indexing with [1:150] would pad the result with NA entries.
top.freq <- head(sort(termFreq, decreasing = TRUE), 150)
top.terms <- data.frame(terms = stemCompletion(names(top.freq), dict),
                        freq = top.freq, stringsAsFactors = FALSE)

detach(package:RWeka)
detach(package:tm)
detach(package:SnowballC)
# Top term wordcloud:
# Draws the completed top terms sized by frequency, using an 8-color
# "Dark2" palette; only terms with frequency >= 100 are plotted.
library(wordcloud)
library(RColorBrewer)
pald <- brewer.pal(n = 8, name = "Dark2")
wordcloud(
  words = top.terms$terms,
  freq = top.terms$freq,
  min.freq = 100,
  rot.per = 0.15,
  colors = pald
)
detach(package:wordcloud)
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
## TWEET HASHTAGS: extract all & by party ##
library("plyr")
library("RWeka")

# Tokenize lower-cased tweet text and keep the tokens that start with "#".
tagtokens <- function(txt) {
  grep("^#.+", WordTokenizer(tolower(txt)), value = TRUE)
}

# Frequency table of tags (plyr::count), most frequent first.
tagfreq <- function(tags) {
  tag.count <- count(tags)
  tag.count[order(tag.count[, 2], decreasing = TRUE), ]
}

# %in% is used for the party filters because `==` yields NA for tweets with
# a missing party, and NA indices would inject NA entries into the text.
all.tags <- tagfreq(tagtokens(pw.dfs$content))
rep.tags <- tagfreq(tagtokens(pw.dfs$content[pw.dfs$party %in% "Rep"]))
dem.tags <- tagfreq(tagtokens(pw.dfs$content[pw.dfs$party %in% "Dem"]))

detach(package:RWeka)
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
## TWEET SENTIMENT: sentiment and polarity ##

# --- Polarity from the term-document matrix (tm.plugin.sentiment) ---
library(tm)
library(tm.plugin.sentiment)

# NOTE THAT THE tm_tag_score function has been renamed to tm_term_score !
# The alias keeps code that still calls the old name working.
tm_tag_score <- tm_term_score

pw.dfs$tw.polarity <- polarity(text.dtm)
# is.na() is also TRUE for NaN, so one test covers both missing cases.
pw.dfs$tw.polarity <- replace(pw.dfs$tw.polarity, is.na(pw.dfs$tw.polarity), 0)

pol.pos <- pos_refs_per_ref(text.dtm)
pol.neg <- neg_refs_per_ref(text.dtm)

detach(package:tm.plugin.sentiment)
detach(package:tm)

# --- Polarity from the cleaned tweet text (qdap) ---
# tm.plugin.sentiment is detached above, so polarity() here is presumably
# the qdap function of the same name.
library(qdap)
qdap.polarity <- polarity(pw.dfs$cclean)$all
pw.dfs$word.count <- qdap.polarity$wc
pw.dfs$tw.polarity2 <- qdap.polarity$polarity
pw.dfs$tw.polarity2 <- replace(pw.dfs$tw.polarity2, is.na(pw.dfs$tw.polarity2), 0)
detach(package:qdap)
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv##
## COLLECT ADDITIONAL DATA FROM TWITTER ##
library(twitteR)

# Create an app at http://apps.twitter.com/
# Go to the "Keys and access token" tab and get the key info to fill below.
# (If access_token & access_secret are not included, browser authentication
# is required.)

# Authenticate with the Twitter API using the app's consumer credentials.
# To skip browser authentication, also pass
#   access_token = "ACCESS_TOKEN_HERE", access_secret = "ACCESS_SECRET_HERE"
# to setup_twitter_oauth().
auth <- function() {
  app.key <- "CONSUMER_KEY_HERE"
  app.secret <- "CONSUMER_SECRET_HERE"
  setup_twitter_oauth(consumer_key = app.key, consumer_secret = app.secret)
}

auth()
# Twitter usernames from Politwoops deleted tweet data:
polit.user <- unique(pw.dfs$twitter)
polit.id <- unique(pw.dfs$id)

# Get info about the users from the Twitter API:
polit.twinfo <- lookupUsers(polit.user)

# The twitteR class behaves a bit odd at times, so it is easier to work with
# a plain list extracted from it. The field names must live in tw.fields -
# the lapply() below reads that name (they were previously assigned to
# polit.tw by mistake, leaving tw.fields undefined).
tw.fields <- c("id", "name", "statusesCount", "followersCount", "friendsCount",
               "profileImageUrl", "created", "verified", "location",
               "screenName")

# One plain list per user holding just the fields named in tw.fields.
polit.tw <- lapply(polit.twinfo, function(x) {
  y <- list()
  for (i in tw.fields) {
    y[[i]] <- x[[i]]
  }
  y
})
# Download Twitter user profile images
# (to use in visualizations)
library("png")
library("jpeg")

# One row per user: image URL, user id and user name.
# vapply() guarantees character columns (and fails loudly if a user has no
# URL or id) instead of sapply() silently returning a list.
img.dld <- data.frame(
  file.url = vapply(polit.tw, function(x) as.character(x$profileImageUrl),
                    character(1)),
  user.id = vapply(polit.tw, function(x) as.character(x$id), character(1)),
  user.name = names(polit.tw),
  stringsAsFactors = FALSE
)

# Drop the "_normal" marker from the URL.
img.dld$file.url <- sub("_normal", "", img.dld$file.url, fixed = TRUE)

# Local file name = user name + the URL's file extension (".jpeg" -> ".jpg").
img.dld$file.name <- paste0(img.dld$user.name,
                            sub(".*(\\.[^\\.]+$)", "\\1", img.dld$file.url))
img.dld$file.name <- gsub("\\.jpeg", "\\.jpg", img.dld$file.name)
img.dld <- img.dld[order(img.dld$user.id), ]

# seq_len() makes the loop a no-op for an empty table (1:0 would iterate),
# and a failed download is reported without aborting the remaining ones.
for (i in seq_len(nrow(img.dld))) {
  tryCatch(
    download.file(img.dld$file.url[i], img.dld$file.name[i], mode = 'wb'),
    error = function(err) {
      print(paste("ERROR: ", conditionMessage(err), "at #", i))
    }
  )
}

detach(package:jpeg)
detach(package:png)
# Get info about the followers, followees & timeline of each politician:
# Rotate the type of data obtained from twitter (through get.ind$iter),
# and pause for sleep.minutes when the Twitter rate limit is reached
# (this is slow due to rate limits and may take a few days to run).
#
# get.ind holds, for each data type, the index of the next user to fetch
# (fr = friends, fol = followers, tw = timeline), the type being fetched
# (current) and the iteration counter (iter). An index only advances after
# a successful call, so a call interrupted by a warning is retried on a
# later iteration.
# NOTE(review): only warnings are handled here; an error raised by a call
# still stops the loop, and a user whose call always warns is retried
# indefinitely.
get.ind <- data.frame(fr = 1, fol = 1, tw = 1, current = "fr", iter = 0)
sleep.minutes <- 5

while (any(get.ind[, 1:3] <= length(polit.tw))) {
  get.ind$iter <- get.ind$iter + 1
  tryCatch({
    # Friends (accounts the politician follows):
    if (get.ind$fr <= length(polit.tw) && get.ind$iter %% 3 == 1) {
      get.ind$current <- "fr"
      # the API only serves 5000 at a time
      rorl <- ifelse(polit.tw[[get.ind$fr]]$friendsCount < 5000, 0, 3000)
      polit.tw[[get.ind$fr]]$friend.ids <-
        polit.twinfo[[get.ind$fr]]$getFriendIDs(retryOnRateLimit = rorl)
      get.ind$fr <- get.ind$fr + 1
    }
    # Followers:
    if (get.ind$fol <= length(polit.tw) && get.ind$iter %% 3 == 2) {
      get.ind$current <- "fol"
      # the API only serves 5000 at a time.
      # The follower count is read for the user at get.ind$fol; using
      # get.ind$fr here looked at the wrong user and went out of bounds
      # once all friend lists had been collected.
      rorl <- ifelse(polit.tw[[get.ind$fol]]$followersCount < 5000, 0, 3000)
      polit.tw[[get.ind$fol]]$follower.ids <-
        polit.twinfo[[get.ind$fol]]$getFollowerIDs(retryOnRateLimit = rorl)
      get.ind$fol <- get.ind$fol + 1
    }
    # Timeline:
    if (get.ind$tw <= length(polit.tw) && get.ind$iter %% 3 == 0) {
      get.ind$current <- "tw"
      polit.tw[[get.ind$tw]]$tweets <-
        userTimeline(names(polit.tw)[get.ind$tw], retryOnRateLimit = 0)
      get.ind$tw <- get.ind$tw + 1
    }
  },
  warning = function(warn) {
    # Report which data type and index hit the warning, then wait.
    print(paste("Warning at ", get.ind$current, "=",
                get.ind[[get.ind$current]], "and time =", Sys.time()))
    Sys.sleep(sleep.minutes * 60)
  })
}

detach(package:twitteR)
##vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv## | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Very impressive. --