snapshot of the number of unique tweets from SAA2014 (unique means a unique combination of username, message text and time, so an RT is not counted as a duplicate because it has a different username and time)
# load csv file downloaded from google docs at 10:30am PST, 28 April 2014
# https://docs.google.com/spreadsheet/ccc?key=0Alr3EPKs-tcRdGhVRFNKeHVadDhHNGdGYU84Z255X1E&usp=drive_web
saa2014_1 <- read.csv("C:/Users/marwick/Downloads/SAA 2014 Tweets - Archive.csv", stringsAsFactors = FALSE)
# Second tweet archive, csv file downloaded from google docs at 10:30am PST, 28 April 2014
# https://docs.google.com/spreadsheet/ccc?key=0Ak6w3axv7XKTdHpHdFNfeFNKMk45WFVWQkhCeGdLMWc&usp=drive_web#gid=82
saa2014_2 <- read.csv("C:/Users/marwick/Downloads/%23SAA2014 file 2 - Archive.csv", stringsAsFactors = FALSE)
# combine two files
saa2014 <- rbind(saa2014_1, saa2014_2) # 26,611 rows
# remove empty rows
nrow(saa2014 <- saa2014[saa2014$from_user != "", ]) # 25,376 rows remaining
# remove duplicates
nrow(saa2014_dedup <- saa2014[!duplicated(saa2014), ]) # 15,315 unique messages
# how many duplicates were there?
nrow(saa2014[duplicated(saa2014),]) # 10,061 duplicate rows
# how many unique tweeters?
length(unique(saa2014_dedup$from_user)) # 596 people tweeting
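# optional check, not in the original gist: the ten most active accounts,
# counted from the deduplicated archive
head(sort(table(saa2014_dedup$from_user), decreasing = TRUE), 10)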
################ get SAA abstracts ############
# from http://stackoverflow.com/a/21449040/1036500
# folder with 1000s of PDFs
dest <- "F:\\My Documents\\My Papers\\conferences\\SAA2010\\SAA_Abstracts\\PDFs"
# make a vector of PDF file names
myfiles <- list.files(path = dest, pattern = "pdf", full.names = TRUE)
# convert each PDF file that is named in the vector into a text file
# text file is created in the same directory as the PDFs
# note that my pdftotext.exe is in a different location to yours
lapply(myfiles, function(i) system(paste('"C:/Program Files/xpdf/bin64/pdftotext.exe"', paste0('"', i, '"')), wait = FALSE) )
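# optional check, not in the original gist: pdftotext should have written one
# txt file per PDF into the same folder, so count how many were produced
length(list.files(path = dest, pattern = "txt$", full.names = TRUE))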
# if we just want the abstracts, we can use regex to extract that part of
# each as a txt file
# Assumes that the abstract is always between the characters ] and [
abstracts_2014 <- paste0(dest, "/2014.txt")
j <- paste0(scan(abstracts_2014, what = character()), collapse = " ")
abstracts_2014_cut <- unlist(regmatches(j, gregexpr("(?<=\\]).*?(?=\\[)", j, perl = TRUE)))
# inspect
abstracts_2014_cut[1:5]
# drop items with less than 100 words since those are not
# abstracts but redirects
# get word counts for each list item
word_counts <- sapply(abstracts_2014_cut, function(i) sapply(gregexpr("\\W+", i), length) + 1)
# keep only abstracts with more than 100 words
abstracts_2014_cut_subset <- abstracts_2014_cut[word_counts > 100]
# inspect
abstracts_2014_cut_subset[1:10]
# Write abstracts into separate txt files...
# write abstracts as txt files
# (or use them in the list for whatever you want to do next)
setwd("F:\\My Documents\\My Papers\\conferences\\SAA2010\\SAA_Abstracts\\2014_text_files")
lapply(1:length(abstracts_2014_cut_subset), function(i) write.table(abstracts_2014_cut_subset[i], file=paste("abstract", i, "txt", sep="."), quote = FALSE, row.names = FALSE, col.names = FALSE, eol = " " ))
################ get SAA tweets ############
# Now get tweets
# load csv file downloaded from google docs at 9:30am PST, 26 April 2014
# https://docs.google.com/spreadsheet/ccc?key=0Alr3EPKs-tcRdGhVRFNKeHVadDhHNGdGYU84Z255X1E&usp=drive_web
saa2014_tweets <- read.csv("C:/Users/marwick/Downloads/SAA 2014 Tweets - Archive.csv", stringsAsFactors = FALSE)
# extract just the tweet text (the third column)
saa2014_tweets_text <- saa2014_tweets[,3]
# Convert each tweet text to text file
# https://gist.github.com/benmarwick/9278490
invisible(lapply(1:length(saa2014_tweets_text), function(i) write.table(saa2014_tweets_text[i], file = paste0("tweet.", i, ".txt"), row.names = FALSE, col.names = FALSE, quote = FALSE)))
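# optional check, not in the original gist: count the tweet txt files
# written to the current working directory
length(list.files(pattern = "^tweet\\..*\\.txt$"))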
######## combine abstracts and tweets ###############
library(tm)
# read all the txt files in the current working directory
# (the abstract and tweet files written above) into a corpus
abstracts_and_tweets <- Corpus(DirSource())
######## pre-process text ###############
tdm <- TermDocumentMatrix(abstracts_and_tweets,
                          control = list(removePunctuation = TRUE,
                                         stopwords = TRUE,
                                         removeNumbers = TRUE,
                                         stripWhitespace = TRUE,
                                         tolower = TRUE))
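# optional inspection, not in the original gist: the matrix should have
# one row per term and one column per document (abstracts + tweets)
dim(tdm)
inspect(tdm[1:10, 1:5])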
# isolate terms to remove non-nouns
terms <- tdm$dimnames$Terms
# remove punctuation (again)
terms <- gsub('[[:punct:]]', '', terms)
library(NLP); library(data.table); library(openNLP)
tagPOS <- function(x) {
  s <- NLP::as.String(x)
  ## Need sentence and word token annotations.
  a1 <- NLP::Annotation(1L, "sentence", 1L, nchar(s))
  a2 <- NLP::annotate(s, openNLP::Maxent_Word_Token_Annotator(), a1)
  a3 <- NLP::annotate(s, openNLP::Maxent_POS_Tag_Annotator(), a2)
  ## Determine the distribution of POS tags for word tokens.
  a3w <- a3[a3$type == "word"]
  POStags <- unlist(lapply(a3w$features, `[[`, "POS"))
  ## Extract token/POS pairs (all of them): easy - not needed
  # POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ")
  return(unlist(POStags))
}
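# a minimal illustration of tagPOS() on a made-up string (not from the data):
# it returns one Penn Treebank tag per token, e.g. "NN" for singular nouns
tagPOS("ceramic analysis at the annual meeting")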
# divide Terms into chunks of 1000 terms each because more than that can cause
# memory problems
terms_split <- split(terms, ceiling(seq_along(terms)/1000))
# loop over each chunk of 1000 terms to do POStagging, I found that trying to
# do 10,000 terms or more causes Java memory problems, so this is a very safe
# method to try not to fill memory
terms_split_chunks <- plyr::llply(terms_split, function(i){
  tmp <- paste(gsub("[^[:alnum:]]", "", i), collapse = " ")
  tagPOS(tmp)
}, .progress = "text")
# get all the tags in a vector
terms_split_chunks_out <- unname(c(unlist(terms_split_chunks)))
# subset document term matrix terms to keep only nouns
tdm_nouns <- tdm[ c(tdm$dimnames$Terms[terms_split_chunks_out == "NN"]), ]
# shrink the tdm to keep performance tolerable
tdm_nouns_sparse <- removeSparseTerms(tdm_nouns, 0.9999)
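# optional check, not in the original gist: how many noun terms survive
# the sparsity filter
dim(tdm_nouns_sparse)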
######## generate topic model ###############
library(topicmodels)
# note that LDA() expects a document-term matrix, so transpose the
# term-document matrix before fitting 100 topics
saa_topics <- LDA(t(tdm_nouns_sparse), k = 100)
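# a sketch of inspecting the fitted model, assuming the LDA call above ran:
# the ten highest-probability terms in each topic, and the most likely
# topic for the first few documents
terms(saa_topics, 10)
topics(saa_topics)[1:10]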