Snapshot of the number of unique tweets from SAA2014 (unique means a unique combination of username, message text and time, so an RT is not counted as a duplicate because it has a different username and time)
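A de-duplication keyed on exactly those three columns would be a one-liner like the sketch below (the column names from_user, text and time are assumed from the TAGS archive layout; the script below simply drops whole-row duplicates):

saa2014_dedup <- saa2014[!duplicated(saa2014[, c("from_user", "text", "time")]), ]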
# load csv file downloaded from google docs at 10:30am PST, 28 April 2014
# https://docs.google.com/spreadsheet/ccc?key=0Alr3EPKs-tcRdGhVRFNKeHVadDhHNGdGYU84Z255X1E&usp=drive_web
saa2014_1 <- read.csv("C:/Users/marwick/Downloads/SAA 2014 Tweets - Archive.csv", stringsAsFactors = FALSE)
# second tweet archive, csv file downloaded from google docs at 10:30am PST, 28 April 2014
# https://docs.google.com/spreadsheet/ccc?key=0Ak6w3axv7XKTdHpHdFNfeFNKMk45WFVWQkhCeGdLMWc&usp=drive_web#gid=82
saa2014_2 <- read.csv("C:/Users/marwick/Downloads/%23SAA2014 file 2 - Archive.csv", stringsAsFactors = FALSE)
# combine the two files
saa2014 <- rbind(saa2014_1, saa2014_2) # 26,611 rows
# remove empty rows
nrow(saa2014 <- saa2014[saa2014$from_user != "", ]) # 25,376 rows remaining
# remove duplicates
nrow(saa2014_dedup <- saa2014[!duplicated(saa2014), ]) # 15,315 unique messages
# how many duplicates were there?
nrow(saa2014[duplicated(saa2014), ]) # 10,061 duplicate rows
# how many unique tweeters?
length(unique(saa2014_dedup$from_user)) # 596 people tweeting
################ get SAA abstracts ############
# from http://stackoverflow.com/a/21449040/1036500
# folder with 1000s of PDFs
dest <- "F:\\My Documents\\My Papers\\conferences\\SAA2010\\SAA_Abstracts\\PDFs"
# make a vector of PDF file names
myfiles <- list.files(path = dest, pattern = "pdf", full.names = TRUE)
# convert each PDF file that is named in the vector into a text file
# text file is created in the same directory as the PDFs
# note that my pdftotext.exe is in a different location to yours
lapply(myfiles, function(i) system(paste('"C:/Program Files/xpdf/bin64/pdftotext.exe"', paste0('"', i, '"')), wait = FALSE))
# if we just want the abstracts, we can use regex to extract that part of
# each file as a txt file
# assumes that the abstract is always between the characters ] and [
abstracts_2014 <- paste0(dest, "/2014.txt")
j <- paste0(scan(abstracts_2014, what = character()), collapse = " ")
abstracts_2014_cut <- unlist(regmatches(j, gregexpr("(?<=\\]).*?(?=\\[)", j, perl = TRUE)))
# inspect
abstracts_2014_cut[1:5]
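# to see what the lookaround pattern captures, try it on a toy string
# (illustration only, not part of the original workflow)
toy <- "[101] First abstract text. [102] Second abstract text. [103]"
regmatches(toy, gregexpr("(?<=\\]).*?(?=\\[)", toy, perl = TRUE))
# returns the two stretches of text sitting between a ] and the next [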
# drop items with less than 100 words since those are not
# abstracts but redirects
# get word counts for each list item
word_counts <- sapply(abstracts_2014_cut, function(i) sapply(gregexpr("\\W+", i), length) + 1)
# keep only abstracts with more than 100 words
abstracts_2014_cut_subset <- abstracts_2014_cut[word_counts > 100]
# inspect
abstracts_2014_cut_subset[1:10]
# write abstracts into separate txt files
# (or use them in the list for whatever you want to do next)
setwd("F:\\My Documents\\My Papers\\conferences\\SAA2010\\SAA_Abstracts\\2014_text_files")
lapply(1:length(abstracts_2014_cut_subset), function(i) write.table(abstracts_2014_cut_subset[i], file = paste("abstract", i, "txt", sep = "."), quote = FALSE, row.names = FALSE, col.names = FALSE, eol = " "))
################ get SAA tweets ############
# now get tweets
# load csv file downloaded from google docs at 9:30am PST, 26 April 2014
# https://docs.google.com/spreadsheet/ccc?key=0Alr3EPKs-tcRdGhVRFNKeHVadDhHNGdGYU84Z255X1E&usp=drive_web
saa2014_tweets <- read.csv("C:/Users/marwick/Downloads/SAA 2014 Tweets - Archive.csv", stringsAsFactors = FALSE)
# keep just the tweet text (third column of the archive)
saa2014_tweets_text <- saa2014_tweets[, 3]
# convert each tweet text to a text file
# https://gist.github.com/benmarwick/9278490
invisible(lapply(1:length(saa2014_tweets_text), function(i) write.table(saa2014_tweets_text[i], file = paste0("tweet.", i, ".txt"), row.names = FALSE, col.names = FALSE, quote = FALSE)))
######## combine abstracts and tweets ###############
library(tm)
abstracts_and_tweets <- Corpus(DirSource())
######## pre-process text ###############
tdm <- TermDocumentMatrix(abstracts_and_tweets,
                          control = list(removePunctuation = TRUE,
                                         removeNumbers = TRUE,
                                         stopwords = TRUE,
                                         stripWhitespace = TRUE,
                                         tolower = TRUE))
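# a quick sanity check of the matrix before POS tagging (optional; the
# frequency threshold here is an arbitrary choice)
dim(tdm)                          # rows are terms, columns are documents
findFreqTerms(tdm, lowfreq = 50)  # terms that occur at least 50 times overall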
# isolate terms so that non-nouns can be removed
terms <- tdm$dimnames$Terms
# remove punctuation (again)
terms <- gsub("[[:punct:]]", "", terms)
library(NLP); library(data.table); library(openNLP)
tagPOS <- function(x) {
  s <- NLP::as.String(x)
  ## need sentence and word token annotations
  a1 <- NLP::Annotation(1L, "sentence", 1L, nchar(s))
  a2 <- NLP::annotate(s, openNLP::Maxent_Word_Token_Annotator(), a1)
  a3 <- NLP::annotate(s, openNLP::Maxent_POS_Tag_Annotator(), a2)
  ## determine the distribution of POS tags for word tokens
  a3w <- a3[a3$type == "word"]
  POStags <- unlist(lapply(a3w$features, `[[`, "POS"))
  ## extract token/POS pairs (all of them): easy - not needed here
  # POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ")
  return(unlist(POStags))
}
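# quick check of what tagPOS() returns, on a made-up phrase (requires the
# openNLP English models to be installed; the exact tags may vary)
tagPOS("pottery sherds from the excavation site")
# e.g. "NN" "NNS" "IN" "DT" "NN" "NN"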
# divide the terms into chunks of 1000 terms each because more than that can
# cause memory problems
terms_split <- split(terms, ceiling(seq_along(terms) / 1000))
# loop over each chunk of 1000 terms to do POS tagging; I found that trying to
# do 10,000 terms or more causes Java memory problems, so this is a very safe
# method to avoid filling memory
terms_split_chunks <- plyr::llply(terms_split, function(i){
  tmp <- paste(gsub("[^[:alnum:]]", "", i), collapse = " ")
  tagPOS(tmp)
}, .progress = "text")
# get all the tags in a vector
terms_split_chunks_out <- unname(c(unlist(terms_split_chunks)))
# subset the term-document matrix to keep only nouns
tdm_nouns <- tdm[c(tdm$dimnames$Terms[terms_split_chunks_out == "NN"]), ]
# shrink the tdm to keep performance tolerable
tdm_nouns_sparse <- removeSparseTerms(tdm_nouns, 0.9999)
######## generate topic model ###############
library(topicmodels)
# LDA() expects documents as rows, so transpose the term-document matrix
saa_topics <- LDA(t(tdm_nouns_sparse), k = 100)
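# once the model has fitted, the topics can be inspected, e.g. the top ten
# terms in the first five topics and the most likely topic for each of the
# first ten documents (the numbers here are arbitrary choices)
terms(saa_topics, 10)[, 1:5]
topics(saa_topics)[1:10]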