Created
June 3, 2017 18:26
-
-
Save wesslen/2ab6356cf0e7890344bc19e16d65318c to your computer and use it in GitHub Desktop.
Code to parse Gnip JSON to R Dataframe using streamR functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the streamR package once; skip this step if it is already installed.
#install.packages("streamR")
library(streamR)

# functions
# Read Gnip JSON records and parse each one into an R list.
#
# Args:
#   tweets:  Either a single string naming an existing file (read line by
#            line) or a vector whose elements are the JSON strings themselves.
#   verbose: When TRUE, report how many lines were parsed.
#
# Returns:
#   A list with one element per non-empty input line. A line that fails to
#   parse is represented by the error condition raised by fromJSON(), so the
#   list always lines up with the non-empty input lines.
#
# Raises:
#   An error when `tweets` is NULL.
readGnipTweets <- function(tweets, verbose=TRUE){
  ## checking input is correct
  if (is.null(tweets)){
    stop("Error: you need to specify file or object where tweets text was stored.")
  }

  ## Read the text file and save it in memory as a list.
  ## file.exists() only accepts character input, so check the type first.
  if (is.character(tweets) && length(tweets)==1 && file.exists(tweets)){
    lines <- readLines(tweets)
  } else {
    lines <- tweets
  }

  ## Converting to UTF-8. The input is declared as ASCII and sub="" is used,
  ## so any byte that cannot be converted is dropped from the text.
  lines <- iconv(lines, "ASCII", "UTF-8", sub="")

  ## Parse every non-empty line; keep the condition object on failure instead
  ## of aborting the whole run.
  results.list <- lapply(lines[nchar(lines)>0], function(x) {
    tryCatch(fromJSON(x), error=function(e) e)
  })

  # information message: report failures separately from successful parses
  if (verbose==TRUE) {
    failed <- sum(vapply(results.list, inherits, logical(1), what="error"))
    message(length(results.list) - failed, " tweets have been parsed.")
    if (failed > 0) {
      message(failed, " lines could not be parsed as JSON.")
    }
  }
  return(results.list)
}
# Parse Gnip JSON records into a data frame with one row per record.
#
# Args:
#   tweets:   Passed unchanged to readGnipTweets(): a file path or a vector of
#             JSON strings.
#   simplify: Currently unused; kept so existing calls keep working.
#
# Returns:
#   A data.frame with the columns id, body, verb, postedTime, actor.id and
#   generator.displayName. Rows in which every column is NA are removed.
#
# Raises:
#   An error when readGnipTweets() returns an empty list.
#
# To expose another field, add a column below using unlistWithNA().
parseGnipTweets <- function(tweets, simplify=FALSE){
  ## from json to list
  results.list <- readGnipTweets(tweets, verbose=FALSE)

  # nothing was read: stop with a message naming the argument as the caller wrote it
  if (length(results.list)==0){
    stop(deparse(substitute(tweets)), " did not contain any tweets. ",
         "See ?parseTweets for more details.")
  }

  # constructing data frame with tweet and user variables
  df <- data.frame(
    # NOTE(review): presumably skips a 28-character prefix such as
    # "tag:search.twitter.com,2005:" -- confirm against the Gnip payload.
    id = substr(unlistWithNA(results.list, 'id'), 29, 47),
    body = unlistWithNA(results.list, 'body'),
    verb = unlistWithNA(results.list, 'verb'),
    postedTime = unlistWithNA(results.list, 'postedTime'),
    # NOTE(review): presumably skips a 15-character prefix such as
    # "id:twitter.com:" -- confirm. Everything after the prefix is kept; the
    # previous fixed end position (30) cut ids longer than 15 characters.
    actor.id = substring(unlistWithNA(results.list, c('actor','id')), 16),
    generator.displayName = unlistWithNA(results.list, c('generator','displayName')),
    stringsAsFactors=FALSE)

  # remove any empty rows (records where none of the fields above was present)
  df <- df[rowSums(is.na(df)) != ncol(df),]
  return(df)
}
# Extract one (possibly nested) field from every element of a list, filling
# in NA where the field is absent.
#
# Args:
#   lst:   A list of records (each typically a nested list).
#   field: The path to the wanted value: one key for a top-level field, or
#          several keys to descend through nested lists, e.g. c('actor','id').
#          Any depth is accepted.
#
# Returns:
#   A vector with one entry per element of `lst`; entries are NA where the
#   path does not exist in that element.
#
# Raises:
#   An error when `field` is empty.
unlistWithNA <- function(lst, field){
  if (length(field) == 0){
    stop("'field' must contain at least one key.")
  }

  # Follow `field` key by key; give up with NULL as soon as a level is missing
  # so that a missing parent does not raise a subscript error.
  extract <- function(x){
    for (key in field){
      if (is.null(x)){
        return(NULL)
      }
      x <- x[[key]]
    }
    x
  }

  values <- lapply(lst, extract)
  notnulls <- !vapply(values, is.null, logical(1))

  vect <- rep(NA, length(lst))
  # Only assign when something was found, so the replacement is never empty.
  if (any(notnulls)){
    vect[notnulls] <- unlist(values[notnulls])
  }
  return(vect)
}
## run this code
# Location of the Gnip JSON file; change this path to point at your own file.
txt <- "~/Downloads/50_activities.json"
# Parse the file; `t` receives the data frame returned by parseGnipTweets().
t <- parseGnipTweets(txt)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment