Skip to content

Instantly share code, notes, and snippets.

@wesslen
Created June 3, 2017 18:26
Show Gist options
  • Save wesslen/2ab6356cf0e7890344bc19e16d65318c to your computer and use it in GitHub Desktop.
Save wesslen/2ab6356cf0e7890344bc19e16d65318c to your computer and use it in GitHub Desktop.
Code to parse Gnip JSON to R Dataframe using streamR functions
# install the streamR package the first time -- no need if you already have it installed
#install.packages("streamR")
library(streamR)
# functions
#' Read Gnip JSON activities into a list of parsed records
#'
#' @param tweets Either a single path to an existing file (read line by line
#'   with readLines()) or a character vector whose elements are JSON strings.
#' @param verbose If TRUE, emit a message with the number of records parsed
#'   (and, when any occurred, the number of lines that failed to parse).
#' @return A list with one element per non-empty input line: the value
#'   returned by fromJSON() for that line, or the error condition object when
#'   fromJSON() failed on it. Failed lines are kept in place (not dropped) so
#'   positions still correspond to the non-empty input lines.
readGnipTweets <- function(tweets, verbose = TRUE) {
  ## checking input is correct
  if (is.null(tweets)) {
    stop("Error: you need to specify file or object where tweets text was stored.")
  }

  ## A single string naming an existing file is read from disk; anything else
  ## is treated as the JSON text itself.
  if (length(tweets) == 1 && file.exists(tweets)) {
    lines <- readLines(tweets)
  } else {
    lines <- tweets
  }

  ## Converting to UTF-8. NOTE: with from = "ASCII" and sub = "", every byte
  ## that is not valid ASCII is removed, so accented characters and emoji are
  ## dropped from the text before parsing.
  lines <- iconv(lines, "ASCII", "UTF-8", sub = "")
  lines <- lines[nchar(lines) > 0]

  ## Parse each line independently; a malformed line yields its error
  ## condition instead of aborting the whole read.
  results.list <- lapply(lines, function(x) {
    tryCatch(fromJSON(x), error = function(e) e)
  })

  ## information message: count only lines that really parsed, and report
  ## failures separately instead of folding them into the "parsed" total.
  if (isTRUE(as.logical(verbose))) {
    failed <- vapply(results.list, inherits, logical(1), what = "error")
    n_failed <- sum(failed)
    n_parsed <- length(results.list) - n_failed
    info <- paste0(n_parsed, " tweets have been parsed.")
    if (n_failed > 0) {
      info <- paste0(info, " ", n_failed, " lines could not be parsed as JSON.")
    }
    message(info)
  }

  results.list
}
#' Parse Gnip JSON activities into a data frame
#'
#' Reads the activities with readGnipTweets() and extracts a fixed set of
#' fields with unlistWithNA(). To expose more fields, add further columns to
#' the data.frame() call below using unlistWithNA().
#'
#' @param tweets Path to a file of JSON activities (one per line) or a
#'   character vector of JSON strings; passed on to readGnipTweets().
#' @param simplify Currently unused; kept so existing calls that pass it keep
#'   working.
#' @return A data.frame with character columns id, body, verb, postedTime,
#'   actor.id and generator.displayName, one row per parsed record. Rows in
#'   which every column is NA (e.g. lines that failed to parse) are removed.
parseGnipTweets <- function(tweets, simplify = FALSE) {
  ## from json to list
  results.list <- readGnipTweets(tweets, verbose = FALSE)

  # nothing to build a data frame from
  if (length(results.list) == 0) {
    stop(deparse(substitute(tweets)), " did not contain any tweets. ",
         "See ?parseTweets for more details.")
  }

  # The raw 'id' and 'actor.id' values carry a colon-terminated prefix
  # (presumably "tag:search.twitter.com,2005:" and "id:twitter.com:" --
  # confirm against your data). Strip everything up to the last colon rather
  # than cutting at fixed character positions, which truncated identifiers
  # longer than 19 (id) or 15 (actor.id) characters.
  strip_id_prefix <- function(x) sub("^.*:", "", x)

  # constructing data frame with tweet and user variables
  df <- data.frame(
    id = strip_id_prefix(unlistWithNA(results.list, "id")),
    body = unlistWithNA(results.list, "body"),
    verb = unlistWithNA(results.list, "verb"),
    postedTime = unlistWithNA(results.list, "postedTime"),
    actor.id = strip_id_prefix(unlistWithNA(results.list, c("actor", "id"))),
    generator.displayName = unlistWithNA(results.list,
                                         c("generator", "displayName")),
    stringsAsFactors = FALSE
  )

  # remove any empty rows (every extracted field missing)
  df[rowSums(is.na(df)) != ncol(df), ]
}
#' Extract one (possibly nested) field from every element of a list
#'
#' @param lst A list of records (typically named lists from parsed JSON).
#' @param field Character vector giving the path to the value: one name for a
#'   top-level field, or several names to descend through nested lists
#'   (e.g. c("actor", "id")). Any depth is supported.
#' @return A vector with exactly one entry per element of `lst`. Entries are
#'   NA where the path is missing, where an intermediate value cannot be
#'   indexed by name, or where the value found is empty. When the value found
#'   holds more than one item, only its first item is used so the result stays
#'   aligned with `lst`.
unlistWithNA <- function(lst, field) {
  if (length(field) == 0) {
    stop("'field' must name at least one level to extract.", call. = FALSE)
  }

  # Walk the path for a single record; NULL means "no usable value".
  pluck_first <- function(x) {
    for (key in field) {
      if (is.null(x)) {
        return(NULL)
      }
      # Indexing an atomic value by a name it lacks raises "subscript out of
      # bounds"; treat that the same as a missing field.
      x <- tryCatch(x[[key]], error = function(e) NULL)
    }
    flat <- unlist(x, use.names = FALSE)
    if (length(flat) == 0) NULL else flat[[1]]
  }

  values <- lapply(lst, pluck_first)
  found <- !vapply(values, is.null, logical(1))

  # Start from all-NA; assigning the found values coerces the vector to
  # their common type.
  vect <- rep(NA, length(lst))
  vect[found] <- unlist(values[found], use.names = FALSE)
  vect
}
## Example usage
# Point `txt` at the Gnip JSON file to load, then parse it into a data frame.
txt <- "~/Downloads/50_activities.json"
t <- parseGnipTweets(tweets = txt)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment