Snapshot of the number of unique tweets from SAA2014 (unique means a unique combination of username, message text and time, so an RT is not counted as a duplicate because it has a different username and time)
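A de-duplication keyed on exactly those three columns would be a one-liner like the sketch below (the column names from_user, text and time are assumed from the TAGS archive layout; the script below simply drops whole-row duplicates):

saa2014_dedup <- saa2014[!duplicated(saa2014[, c("from_user", "text", "time")]), ]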
# load csv file downloaded from google docs at 10:30am PST, 28 April 2014
# https://docs.google.com/spreadsheet/ccc?key=0Alr3EPKs-tcRdGhVRFNKeHVadDhHNGdGYU84Z255X1E&usp=drive_web
saa2014_1 <- read.csv("C:/Users/marwick/Downloads/SAA 2014 Tweets - Archive.csv", stringsAsFactors = FALSE)
# second tweet archive, csv file downloaded from google docs at 10:30am PST, 28 April 2014
# https://docs.google.com/spreadsheet/ccc?key=0Ak6w3axv7XKTdHpHdFNfeFNKMk45WFVWQkhCeGdLMWc&usp=drive_web#gid=82
saa2014_2 <- read.csv("C:/Users/marwick/Downloads/%23SAA2014 file 2 - Archive.csv", stringsAsFactors = FALSE)
# combine the two files
saa2014 <- rbind(saa2014_1, saa2014_2) # 26,611 rows
# remove empty rows
nrow(saa2014 <- saa2014[saa2014$from_user != "", ]) # 25,376 rows remaining
# remove duplicates
nrow(saa2014_dedup <- saa2014[!duplicated(saa2014), ]) # 15,315 unique messages
# how many duplicates were there?
nrow(saa2014[duplicated(saa2014), ]) # 10,061 duplicate rows
# how many unique tweeters?
length(unique(saa2014_dedup$from_user)) # 596 people tweeting
################ get SAA abstracts ############
# from http://stackoverflow.com/a/21449040/1036500
# folder with 1000s of PDFs
dest <- "F:\\My Documents\\My Papers\\conferences\\SAA2010\\SAA_Abstracts\\PDFs"
# make a vector of PDF file names
myfiles <- list.files(path = dest, pattern = "pdf", full.names = TRUE)
# convert each PDF file that is named in the vector into a text file
# text file is created in the same directory as the PDFs
# note that my pdftotext.exe is in a different location to yours
lapply(myfiles, function(i) system(paste('"C:/Program Files/xpdf/bin64/pdftotext.exe"', paste0('"', i, '"')), wait = FALSE))
# if we just want the abstracts, we can use regex to extract that part of
# each file as a txt file
# assumes that the abstract is always between the characters ] and [
abstracts_2014 <- paste0(dest, "/2014.txt")
j <- paste0(scan(abstracts_2014, what = character()), collapse = " ")
abstracts_2014_cut <- unlist(regmatches(j, gregexpr("(?<=\\]).*?(?=\\[)", j, perl = TRUE)))
# inspect
abstracts_2014_cut[1:5]
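# to see what the lookaround pattern captures, try it on a toy string
# (illustration only, not part of the original workflow)
toy <- "[101] First abstract text. [102] Second abstract text. [103]"
regmatches(toy, gregexpr("(?<=\\]).*?(?=\\[)", toy, perl = TRUE))
# returns the two stretches of text sitting between a ] and the next [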
# drop items with less than 100 words since those are not
# abstracts but redirects
# get word counts for each list item
word_counts <- sapply(abstracts_2014_cut, function(i) sapply(gregexpr("\\W+", i), length) + 1)
# keep only abstracts with more than 100 words
abstracts_2014_cut_subset <- abstracts_2014_cut[word_counts > 100]
# inspect
abstracts_2014_cut_subset[1:10]
# write abstracts into separate txt files
# (or use them in the list for whatever you want to do next)
setwd("F:\\My Documents\\My Papers\\conferences\\SAA2010\\SAA_Abstracts\\2014_text_files")
lapply(1:length(abstracts_2014_cut_subset), function(i) write.table(abstracts_2014_cut_subset[i], file = paste("abstract", i, "txt", sep = "."), quote = FALSE, row.names = FALSE, col.names = FALSE, eol = " "))
################ get SAA tweets ############
# now get tweets
# load csv file downloaded from google docs at 9:30am PST, 26 April 2014
# https://docs.google.com/spreadsheet/ccc?key=0Alr3EPKs-tcRdGhVRFNKeHVadDhHNGdGYU84Z255X1E&usp=drive_web
saa2014_tweets <- read.csv("C:/Users/marwick/Downloads/SAA 2014 Tweets - Archive.csv", stringsAsFactors = FALSE)
# keep just the tweet text (third column of the archive)
saa2014_tweets_text <- saa2014_tweets[, 3]
# convert each tweet text to a text file
# https://gist.github.com/benmarwick/9278490
invisible(lapply(1:length(saa2014_tweets_text), function(i) write.table(saa2014_tweets_text[i], file = paste0("tweet.", i, ".txt"), row.names = FALSE, col.names = FALSE, quote = FALSE)))
######## combine abstracts and tweets ###############
library(tm)
abstracts_and_tweets <- Corpus(DirSource())
######## pre-process text ###############
tdm <- TermDocumentMatrix(abstracts_and_tweets,
                          control = list(removePunctuation = TRUE,
                                         removeNumbers = TRUE,
                                         stopwords = TRUE,
                                         stripWhitespace = TRUE,
                                         tolower = TRUE))
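# a quick sanity check of the matrix before POS tagging (optional; the
# frequency threshold here is an arbitrary choice)
dim(tdm)                          # rows are terms, columns are documents
findFreqTerms(tdm, lowfreq = 50)  # terms that occur at least 50 times overall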
# isolate terms so that non-nouns can be removed
terms <- tdm$dimnames$Terms
# remove punctuation (again)
terms <- gsub("[[:punct:]]", "", terms)
library(NLP); library(data.table); library(openNLP)
tagPOS <- function(x) {
  s <- NLP::as.String(x)
  ## need sentence and word token annotations
  a1 <- NLP::Annotation(1L, "sentence", 1L, nchar(s))
  a2 <- NLP::annotate(s, openNLP::Maxent_Word_Token_Annotator(), a1)
  a3 <- NLP::annotate(s, openNLP::Maxent_POS_Tag_Annotator(), a2)
  ## determine the distribution of POS tags for word tokens
  a3w <- a3[a3$type == "word"]
  POStags <- unlist(lapply(a3w$features, `[[`, "POS"))
  ## extract token/POS pairs (all of them): easy - not needed here
  # POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ")
  return(unlist(POStags))
}
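# quick check of what tagPOS() returns, on a made-up phrase (requires the
# openNLP English models to be installed; the exact tags may vary)
tagPOS("pottery sherds from the excavation site")
# e.g. "NN" "NNS" "IN" "DT" "NN" "NN"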
# divide the terms into chunks of 1000 terms each because more than that can
# cause memory problems
terms_split <- split(terms, ceiling(seq_along(terms) / 1000))
# loop over each chunk of 1000 terms to do POS tagging; I found that trying to
# do 10,000 terms or more causes Java memory problems, so this is a very safe
# method to avoid filling memory
terms_split_chunks <- plyr::llply(terms_split, function(i){
  tmp <- paste(gsub("[^[:alnum:]]", "", i), collapse = " ")
  tagPOS(tmp)
}, .progress = "text")
# get all the tags in a vector
terms_split_chunks_out <- unname(c(unlist(terms_split_chunks)))
# subset the term-document matrix to keep only nouns
tdm_nouns <- tdm[c(tdm$dimnames$Terms[terms_split_chunks_out == "NN"]), ]
# shrink the tdm to keep performance tolerable
tdm_nouns_sparse <- removeSparseTerms(tdm_nouns, 0.9999)
######## generate topic model ###############
library(topicmodels)
# LDA() expects documents as rows, so transpose the term-document matrix
saa_topics <- LDA(t(tdm_nouns_sparse), k = 100)
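# once the model has fitted, the topics can be inspected, e.g. the top ten
# terms in the first five topics and the most likely topic for each of the
# first ten documents (the numbers here are arbitrary choices)
terms(saa_topics, 10)[, 1:5]
topics(saa_topics)[1:10]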