# wesslen/TwitterGnipParseJson.R

## TwitterGnipParseJson.R
# install the streamR package the first time -- no need if you already have it installed
#install.packages("streamR")
library(streamR)

# functions
# Read Gnip activity JSON, one JSON document per line, into a list.
#
# Args:
#   tweets:  Either a single string naming an existing file, in which case the
#            file is read line by line, or a character vector whose elements
#            are the JSON lines themselves.
#   verbose: If TRUE, report how many lines were parsed (and how many failed).
#
# Returns:
#   A list with one element per non-empty input line, in input order. Each
#   element is the value fromJSON() returned for that line, or the caught
#   error condition when fromJSON() failed on it.
readGnipTweets <- function(tweets, verbose=TRUE){
  ## checking input is correct
  if (is.null(tweets)){
    stop("Error: you need to specify file or object where tweets text was stored.")
  }

  ## A single string naming an existing file is read from disk; anything
  ## else is treated as the JSON lines themselves
  if (length(tweets)==1 && file.exists(tweets)){
    lines <- readLines(tweets)
  } else {
    lines <- tweets
  }

  ## Converting to UTF-8. The input is declared as ASCII and sub="" replaces
  ## every non-convertible byte with nothing, so non-ASCII bytes are dropped.
  lines <- iconv(lines, "ASCII", "UTF-8", sub="")

  ## Parse every non-empty line. A failure is kept as its error condition so
  ## that one malformed line does not abort the whole read.
  results.list <- lapply(lines[nchar(lines)>0], function(x) {
    tryCatch(fromJSON(x), error=function(e) e)
  })

  ## information message
  ## isTRUE() keeps an NA or non-scalar `verbose` from breaking the if()
  if (isTRUE(verbose == TRUE)) {
    failed <- vapply(results.list, inherits, logical(1), what = "error")
    # report only the lines that really parsed, not the caught errors
    message(sum(!failed), " tweets have been parsed.")
    if (any(failed)) {
      message(sum(failed), " lines could not be parsed as JSON.")
    }
  }
  return(results.list)
}

# Parse Gnip activity JSON into a data frame with one row per activity.
#
# Args:
#   tweets:   Passed unchanged to readGnipTweets() (file path or JSON lines).
#   simplify: Not used by this function; kept so existing calls keep working.
#
# Returns:
#   A data.frame with the columns id, body, verb, postedTime, actor.id and
#   generator.displayName. Rows in which every column is NA are dropped.
#
# Stops with an error when readGnipTweets() returns an empty list.
parseGnipTweets <- function(tweets, simplify=FALSE){

  ## from json to list
  results.list <- readGnipTweets(tweets, verbose=FALSE)

  # nothing was read: fail, naming the expression the caller passed in
  if (length(results.list)==0){
    stop(deparse(substitute(tweets)), " did not contain any tweets. ",
         "See ?parseTweets for more details.")
  }

  # constructing data frame with tweet and user variable
  # you will need to add new variables you want in the dataframe manually, using the unlistWithNA function
  #
  # id and actor.id keep everything from a fixed offset onward. The previous
  # fixed end positions (47 and 30) cut actor ids longer than 15 characters.
  # NOTE(review): the offsets presumably skip the "tag:search.twitter.com,2005:"
  # (28 chars) and "id:twitter.com:" (15 chars) prefixes -- confirm against the feed.
  df <- data.frame(
    id = substring(unlistWithNA(results.list, 'id'), 29L),
    body = unlistWithNA(results.list, 'body'),
    verb = unlistWithNA(results.list, 'verb'),
    postedTime = unlistWithNA(results.list, 'postedTime'),
    actor.id = substring(unlistWithNA(results.list, c('actor','id')), 16L),
    generator.displayName = unlistWithNA(results.list, c('generator','displayName')),
    stringsAsFactors=FALSE)

  # remove any empty rows (every column NA), e.g. lines that failed to parse
  df <- df[rowSums(is.na(df)) != ncol(df),]

  return(df)
}

# Pull one (possibly nested) field out of every element of a list.
#
# Args:
#   lst:   A list of records (each a list or named vector).
#   field: Character vector giving the path to the value: one name for a
#          top-level field, several names to descend through nested records.
#
# Returns:
#   A vector with one entry per element of `lst`, holding the value found at
#   `field`, or NA where any level of the path is missing or the value is
#   empty. With no values found at all the result is a logical NA vector.
#
# Stops with an error when `field` is empty.
unlistWithNA <- function(lst, field){
  if (length(field) == 0) {
    stop("`field` must name at least one key.")
  }

  # Follow `field` one key at a time, giving up with NULL as soon as a level
  # is absent so that a missing parent never raises a subscript error.
  pluck_path <- function(x) {
    for (key in field) {
      if (!is.list(x) && !(key %in% names(x))) {
        return(NULL)
      }
      x <- x[[key]]
    }
    x
  }

  values <- lapply(lst, pluck_path)

  # NULL and zero-length values both count as missing; a zero-length value
  # would otherwise contribute nothing to unlist() and shift later entries.
  present <- vapply(values, length, integer(1)) > 0

  vect <- rep(NA, length(lst))
  if (any(present)) {
    # Values are expected to be scalars: unlist() flattens, so a longer value
    # would not line up with the slots it is assigned to.
    vect[present] <- unlist(values[present])
  }
  return(vect)
}


## run this code
## (everything below executes as soon as the file is sourced)

# set location of Gnip JSON file (hard-coded; edit this path to point at your own file)
txt <-  "~/Downloads/50_activities.json"

# parse the file into a data frame and keep the result in `t`
# NOTE: `t` is also the name of base R's transpose function t()
t <- parseGnipTweets(txt)
	# install the streamR package the first time -- no need if you already have it installed
	#install.packages("streamR")
	library(streamR)

	# functions
	# Read Gnip activity JSON, one JSON document per line, into a list.
	#
	# Args:
	#   tweets:  Either a single string naming an existing file, in which case the
	#            file is read line by line, or a character vector whose elements
	#            are the JSON lines themselves.
	#   verbose: If TRUE, report how many lines were parsed (and how many failed).
	#
	# Returns:
	#   A list with one element per non-empty input line, in input order. Each
	#   element is the value fromJSON() returned for that line, or the caught
	#   error condition when fromJSON() failed on it.
	readGnipTweets <- function(tweets, verbose=TRUE){
	  ## checking input is correct
	  if (is.null(tweets)){
	    stop("Error: you need to specify file or object where tweets text was stored.")
	  }

	  ## A single string naming an existing file is read from disk; anything
	  ## else is treated as the JSON lines themselves
	  if (length(tweets)==1 && file.exists(tweets)){
	    lines <- readLines(tweets)
	  } else {
	    lines <- tweets
	  }

	  ## Converting to UTF-8. The input is declared as ASCII and sub="" replaces
	  ## every non-convertible byte with nothing, so non-ASCII bytes are dropped.
	  lines <- iconv(lines, "ASCII", "UTF-8", sub="")

	  ## Parse every non-empty line. A failure is kept as its error condition so
	  ## that one malformed line does not abort the whole read.
	  results.list <- lapply(lines[nchar(lines)>0], function(x) {
	    tryCatch(fromJSON(x), error=function(e) e)
	  })

	  ## information message
	  ## isTRUE() keeps an NA or non-scalar `verbose` from breaking the if()
	  if (isTRUE(verbose == TRUE)) {
	    failed <- vapply(results.list, inherits, logical(1), what = "error")
	    # report only the lines that really parsed, not the caught errors
	    message(sum(!failed), " tweets have been parsed.")
	    if (any(failed)) {
	      message(sum(failed), " lines could not be parsed as JSON.")
	    }
	  }
	  return(results.list)
	}

	# Parse Gnip activity JSON into a data frame with one row per activity.
	#
	# Args:
	#   tweets:   Passed unchanged to readGnipTweets() (file path or JSON lines).
	#   simplify: Not used by this function; kept so existing calls keep working.
	#
	# Returns:
	#   A data.frame with the columns id, body, verb, postedTime, actor.id and
	#   generator.displayName. Rows in which every column is NA are dropped.
	#
	# Stops with an error when readGnipTweets() returns an empty list.
	parseGnipTweets <- function(tweets, simplify=FALSE){

	  ## from json to list
	  results.list <- readGnipTweets(tweets, verbose=FALSE)

	  # nothing was read: fail, naming the expression the caller passed in
	  if (length(results.list)==0){
	    stop(deparse(substitute(tweets)), " did not contain any tweets. ",
	         "See ?parseTweets for more details.")
	  }

	  # constructing data frame with tweet and user variable
	  # you will need to add new variables you want in the dataframe manually, using the unlistWithNA function
	  #
	  # id and actor.id keep everything from a fixed offset onward. The previous
	  # fixed end positions (47 and 30) cut actor ids longer than 15 characters.
	  # NOTE(review): the offsets presumably skip the "tag:search.twitter.com,2005:"
	  # (28 chars) and "id:twitter.com:" (15 chars) prefixes -- confirm against the feed.
	  df <- data.frame(
	    id = substring(unlistWithNA(results.list, 'id'), 29L),
	    body = unlistWithNA(results.list, 'body'),
	    verb = unlistWithNA(results.list, 'verb'),
	    postedTime = unlistWithNA(results.list, 'postedTime'),
	    actor.id = substring(unlistWithNA(results.list, c('actor','id')), 16L),
	    generator.displayName = unlistWithNA(results.list, c('generator','displayName')),
	    stringsAsFactors=FALSE)

	  # remove any empty rows (every column NA), e.g. lines that failed to parse
	  df <- df[rowSums(is.na(df)) != ncol(df),]

	  return(df)
	}

	# Pull one (possibly nested) field out of every element of a list.
	#
	# Args:
	#   lst:   A list of records (each a list or named vector).
	#   field: Character vector giving the path to the value: one name for a
	#          top-level field, several names to descend through nested records.
	#
	# Returns:
	#   A vector with one entry per element of `lst`, holding the value found at
	#   `field`, or NA where any level of the path is missing or the value is
	#   empty. With no values found at all the result is a logical NA vector.
	#
	# Stops with an error when `field` is empty.
	unlistWithNA <- function(lst, field){
	  if (length(field) == 0) {
	    stop("`field` must name at least one key.")
	  }

	  # Follow `field` one key at a time, giving up with NULL as soon as a level
	  # is absent so that a missing parent never raises a subscript error.
	  pluck_path <- function(x) {
	    for (key in field) {
	      if (!is.list(x) && !(key %in% names(x))) {
	        return(NULL)
	      }
	      x <- x[[key]]
	    }
	    x
	  }

	  values <- lapply(lst, pluck_path)

	  # NULL and zero-length values both count as missing; a zero-length value
	  # would otherwise contribute nothing to unlist() and shift later entries.
	  present <- vapply(values, length, integer(1)) > 0

	  vect <- rep(NA, length(lst))
	  if (any(present)) {
	    # Values are expected to be scalars: unlist() flattens, so a longer value
	    # would not line up with the slots it is assigned to.
	    vect[present] <- unlist(values[present])
	  }
	  return(vect)
	}


	## run this code
	## (everything below executes as soon as the file is sourced)

	# set location of Gnip JSON file (hard-coded; edit this path to point at your own file)
	txt <- "~/Downloads/50_activities.json"

	# parse the file into a data frame and keep the result in `t`
	# NOTE: `t` is also the name of base R's transpose function t()
	t <- parseGnipTweets(txt)