# drivendata/functions_to_keep_it_fresh.R

## functions_to_keep_it_fresh.R
# INSTALL JSON PACKAGE IF NEEDED
#install.packages("jsonlite")
library("jsonlite")

# ===== FUNCTIONS FOR ID MATCHING ============
getReverseIds <- function(pathToRestaurantIdsCSV){
  # Build a Yelp-id -> restaurant-id lookup table from the id map CSV.
  #
  # Args:
  #   pathToRestaurantIdsCSV: path to a CSV whose first column holds the
  #     restaurant id and whose remaining columns each hold a Yelp id
  #     (left blank when a restaurant has fewer Yelp ids).
  #
  # Returns:
  #   A one-column data frame of restaurant ids whose row names are the
  #   Yelp ids. It has zero rows when the CSV has no Yelp id columns.

  # blank cells are read as NA so complete.cases() can drop them below;
  # ids are kept as plain strings regardless of the R version's
  # stringsAsFactors default
  bosToYelp <- read.csv(pathToRestaurantIdsCSV,
                        na.strings="",
                        stringsAsFactors=FALSE)

  # every column after the first; empty (rather than c(2, 1)) when the
  # file only has the restaurant id column
  yelpColumns <- seq_len(ncol(bosToYelp))[-1]

  reversedColumns <- lapply(yelpColumns, function(j){
    # pair the restaurant ids with this Yelp id column
    reverse <- bosToYelp[, c(1, j)]

    # drop rows where either id is missing
    reverse <- reverse[complete.cases(reverse), ]

    # index by the Yelp id and keep only the restaurant id column
    row.names(reverse) <- reverse[, 2]
    reverse[, 1, drop=FALSE]
  })

  # nothing to invert: return an empty lookup with the same single column
  if(length(reversedColumns) == 0){
    return(bosToYelp[0, 1, drop=FALSE])
  }

  # stack the per-column lookups into one table
  do.call("rbind", reversedColumns)
}

replaceYelpWithBostonIds <- function(yelpDataFrame, pathToRestaurantIdsCSV){
  # Overwrite the `business_id` column of a Yelp data frame with the
  # restaurant ids found through the id map CSV.

  # lookup table: row names are Yelp ids, the single column is the restaurant id
  idLookup <- getReverseIds(pathToRestaurantIdsCSV)

  # row of the lookup that corresponds to each business id (NA when absent)
  lookupRows <- match(yelpDataFrame$business_id, row.names(idLookup))

  # pull the restaurant ids out in the same order as the input rows
  yelpDataFrame$business_id <- idLookup[lookupRows, ]

  yelpDataFrame
}

# ===== FUNCTIONS FOR LOADING YELP DATA ============
loadYelpData <- function(pathToYelpJson, pathToRestaurantIdsCSV){
  # Load a file holding one JSON object per line into a flat data frame.
  #
  # Args:
  #   pathToYelpJson: path to the line-delimited JSON file.
  #   pathToRestaurantIdsCSV: path to the id map CSV; only used when the
  #     parsed data has a `business_id` column.
  #
  # Returns:
  #   Whatever fromJSON() builds from the records (nested fields flattened),
  #   with `business_id` replaced by restaurant ids when that column exists.

  jsonLines <- readLines(pathToYelpJson)

  # blank lines would leave empty slots (",,") in the array built below,
  # which is not valid JSON, so drop them before joining
  jsonLines <- jsonLines[nzchar(trimws(jsonLines))]

  # join the records with "," and wrap them in "[" "]" so the parser sees
  # a single JSON array
  jsonData <- paste0("[", paste(jsonLines, collapse=","), "]")

  # parse the data into a 2d data frame
  yelpDf <- fromJSON(jsonData, flatten=TRUE)

  # replace yelp ids with boston ids if this file has business ids
  if("business_id" %in% colnames(yelpDf)){
    yelpDf <- replaceYelpWithBostonIds(yelpDf, pathToRestaurantIdsCSV)
  }

  yelpDf
}

# ===== FUNCTIONS FOR INSPECTION DATA ===========
loadInspections <- function(pathToInspectionsCSV){
  # Read an inspections CSV and give its columns fixed, R-friendly names.
  #
  # Args:
  #   pathToInspectionsCSV: path to a CSV with a header row, an id in the
  #     first column (used as row names) and exactly five further columns.
  #
  # Returns:
  #   A data frame with columns date, restaurant_id, one_star, two_stars
  #   and three_stars, indexed by the id column.
  #
  # Stops with an error when the file does not have five data columns,
  # instead of silently producing NA column names.
  inspectionColumns <- c("date", "restaurant_id", "one_star", "two_stars", "three_stars")

  inspections <- read.csv(pathToInspectionsCSV,
                          header=TRUE,
                          row.names=1)

  if(ncol(inspections) != length(inspectionColumns)){
    stop("expected ", length(inspectionColumns),
         " columns after the id column in ", pathToInspectionsCSV,
         " but found ", ncol(inspections),
         call.=FALSE)
  }

  colnames(inspections) <- inspectionColumns

  inspections
}

writeSubmission <- function(predictionDataFrame, pathToSubmissionFormat, submissionFileName="new_submission.csv"){
  # Write predictions to a CSV that carries the submission format's header.
  #
  # The header is read verbatim (check.names=FALSE) from the submission
  # format file, with its first column treated as row names, and is then
  # applied to the prediction columns before they are written out.
  expectedHeader <- colnames(read.csv(pathToSubmissionFormat,
                                      check.names=FALSE,
                                      row.names=1))

  # relabel the predictions with the names taken from the format file
  colnames(predictionDataFrame) <- expectedHeader

  # save to disk (row names are written as the first column)
  write.csv(predictionDataFrame, file=submissionFileName)
}

# ====================================
#      LOAD THE DATA FROM DISK
# ====================================

# Yelp data sets: each one is parsed from its line-delimited JSON file and
# handed the same id map so any business ids can be translated.
businesses <- loadYelpData(
  pathToYelpJson="data/yelp_academic_dataset_business.json",
  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
)
reviews <- loadYelpData(
  pathToYelpJson="data/yelp_academic_dataset_review.json",
  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
)
checkins <- loadYelpData(
  pathToYelpJson="data/yelp_academic_dataset_checkin.json",
  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
)
users <- loadYelpData(
  pathToYelpJson="data/yelp_academic_dataset_user.json",
  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
)
tips <- loadYelpData(
  pathToYelpJson="data/yelp_academic_dataset_tip.json",
  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
)

# Inspection tables: the training labels and the rows to predict, the
# latter read from the submission format file.
train <- loadInspections(pathToInspectionsCSV="data/train_labels.csv")
test <- loadInspections(pathToInspectionsCSV="data/SubmissionFormat.csv")

# ====================================
#      Make a simple test model
# ====================================
makeSimplePredictions <- function(businessData=businesses,
                                  trainLabels=train,
                                  testLabels=test,
                                  pathToSubmissionFormat="data/SubmissionFormat.csv",
                                  submissionFileName="stars_review_count.csv"){
  # Fit one linear model per violation level on a restaurant's average
  # stars and review count, then write the predictions as a submission.
  #
  # Args:
  #   businessData: data frame with business_id, stars and review_count
  #     columns; defaults to the global `businesses`.
  #   trainLabels: inspections with restaurant_id and the observed
  #     one_star / two_stars / three_stars counts; defaults to the
  #     global `train`.
  #   testLabels: inspections to predict for; defaults to the global `test`.
  #   pathToSubmissionFormat: file whose header is used for the output.
  #   submissionFileName: where the predictions are written.
  #
  # Called with no arguments it behaves as before: it reads the globals
  # and writes "stars_review_count.csv".
  features <- c("stars", "review_count")

  # look up the features of the restaurant behind every inspection row
  X_train <- businessData[match(trainLabels$restaurant_id, businessData$business_id), features]
  X_test <- businessData[match(testLabels$restaurant_id, businessData$business_id), features]

  # these do not change between violation levels, so build them once
  trainFrame <- cbind(trainLabels, X_train)
  testFrame <- cbind(testLabels, X_test)

  finalPredictions <- testLabels

  for(starLevel in c("one_star", "two_stars", "three_stars")){
    # e.g. one_star ~ stars + review_count
    modelFormula <- reformulate(features, response=starLevel)

    # fit a simple linear model
    model <- lm(modelFormula, data=trainFrame)

    # predict the violations; as.integer() truncates toward zero so the
    # result is a whole-number count
    predictions <- as.integer(predict(model, testFrame))

    # counts cannot be negative
    predictions <- pmax(predictions, 0)

    # store the predictions
    finalPredictions[starLevel] <- predictions
  }

  writeSubmission(finalPredictions, pathToSubmissionFormat, submissionFileName)
}
makeSimplePredictions()
	# INSTALL JSON PACKAGE IF NEEDED
	#install.packages("jsonlite")
	library("jsonlite")

	# ===== FUNCTIONS FOR ID MATCHING ============
	getReverseIds <- function(pathToRestaurantIdsCSV){
	  # Build a Yelp-id -> restaurant-id lookup table from the id map CSV.
	  #
	  # Args:
	  #   pathToRestaurantIdsCSV: path to a CSV whose first column holds the
	  #     restaurant id and whose remaining columns each hold a Yelp id
	  #     (left blank when a restaurant has fewer Yelp ids).
	  #
	  # Returns:
	  #   A one-column data frame of restaurant ids whose row names are the
	  #   Yelp ids. It has zero rows when the CSV has no Yelp id columns.

	  # blank cells are read as NA so complete.cases() can drop them below;
	  # ids are kept as plain strings regardless of the R version's
	  # stringsAsFactors default
	  bosToYelp <- read.csv(pathToRestaurantIdsCSV,
	                        na.strings="",
	                        stringsAsFactors=FALSE)

	  # every column after the first; empty (rather than c(2, 1)) when the
	  # file only has the restaurant id column
	  yelpColumns <- seq_len(ncol(bosToYelp))[-1]

	  reversedColumns <- lapply(yelpColumns, function(j){
	    # pair the restaurant ids with this Yelp id column
	    reverse <- bosToYelp[, c(1, j)]

	    # drop rows where either id is missing
	    reverse <- reverse[complete.cases(reverse), ]

	    # index by the Yelp id and keep only the restaurant id column
	    row.names(reverse) <- reverse[, 2]
	    reverse[, 1, drop=FALSE]
	  })

	  # nothing to invert: return an empty lookup with the same single column
	  if(length(reversedColumns) == 0){
	    return(bosToYelp[0, 1, drop=FALSE])
	  }

	  # stack the per-column lookups into one table
	  do.call("rbind", reversedColumns)
	}

	replaceYelpWithBostonIds <- function(yelpDataFrame, pathToRestaurantIdsCSV){
	  # Overwrite the `business_id` column of a Yelp data frame with the
	  # restaurant ids found through the id map CSV.

	  # lookup table: row names are Yelp ids, the single column is the restaurant id
	  idLookup <- getReverseIds(pathToRestaurantIdsCSV)

	  # row of the lookup that corresponds to each business id (NA when absent)
	  lookupRows <- match(yelpDataFrame$business_id, row.names(idLookup))

	  # pull the restaurant ids out in the same order as the input rows
	  yelpDataFrame$business_id <- idLookup[lookupRows, ]

	  yelpDataFrame
	}

	# ===== FUNCTIONS FOR LOADING YELP DATA ============
	loadYelpData <- function(pathToYelpJson, pathToRestaurantIdsCSV){
	  # Load a file holding one JSON object per line into a flat data frame.
	  #
	  # Args:
	  #   pathToYelpJson: path to the line-delimited JSON file.
	  #   pathToRestaurantIdsCSV: path to the id map CSV; only used when the
	  #     parsed data has a `business_id` column.
	  #
	  # Returns:
	  #   Whatever fromJSON() builds from the records (nested fields flattened),
	  #   with `business_id` replaced by restaurant ids when that column exists.

	  jsonLines <- readLines(pathToYelpJson)

	  # blank lines would leave empty slots (",,") in the array built below,
	  # which is not valid JSON, so drop them before joining
	  jsonLines <- jsonLines[nzchar(trimws(jsonLines))]

	  # join the records with "," and wrap them in "[" "]" so the parser sees
	  # a single JSON array
	  jsonData <- paste0("[", paste(jsonLines, collapse=","), "]")

	  # parse the data into a 2d data frame
	  yelpDf <- fromJSON(jsonData, flatten=TRUE)

	  # replace yelp ids with boston ids if this file has business ids
	  if("business_id" %in% colnames(yelpDf)){
	    yelpDf <- replaceYelpWithBostonIds(yelpDf, pathToRestaurantIdsCSV)
	  }

	  yelpDf
	}

	# ===== FUNCTIONS FOR INSPECTION DATA ===========
	loadInspections <- function(pathToInspectionsCSV){
	  # Read an inspections CSV and give its columns fixed, R-friendly names.
	  #
	  # Args:
	  #   pathToInspectionsCSV: path to a CSV with a header row, an id in the
	  #     first column (used as row names) and exactly five further columns.
	  #
	  # Returns:
	  #   A data frame with columns date, restaurant_id, one_star, two_stars
	  #   and three_stars, indexed by the id column.
	  #
	  # Stops with an error when the file does not have five data columns,
	  # instead of silently producing NA column names.
	  inspectionColumns <- c("date", "restaurant_id", "one_star", "two_stars", "three_stars")

	  inspections <- read.csv(pathToInspectionsCSV,
	                          header=TRUE,
	                          row.names=1)

	  if(ncol(inspections) != length(inspectionColumns)){
	    stop("expected ", length(inspectionColumns),
	         " columns after the id column in ", pathToInspectionsCSV,
	         " but found ", ncol(inspections),
	         call.=FALSE)
	  }

	  colnames(inspections) <- inspectionColumns

	  inspections
	}

	writeSubmission <- function(predictionDataFrame, pathToSubmissionFormat, submissionFileName="new_submission.csv"){
	  # Write predictions to a CSV that carries the submission format's header.
	  #
	  # The header is read verbatim (check.names=FALSE) from the submission
	  # format file, with its first column treated as row names, and is then
	  # applied to the prediction columns before they are written out.
	  expectedHeader <- colnames(read.csv(pathToSubmissionFormat,
	                                      check.names=FALSE,
	                                      row.names=1))

	  # relabel the predictions with the names taken from the format file
	  colnames(predictionDataFrame) <- expectedHeader

	  # save to disk (row names are written as the first column)
	  write.csv(predictionDataFrame, file=submissionFileName)
	}

	# ====================================
	# LOAD THE DATA FROM DISK
	# ====================================

	# Yelp data sets: each one is parsed from its line-delimited JSON file and
	# handed the same id map so any business ids can be translated.
	businesses <- loadYelpData(
	  pathToYelpJson="data/yelp_academic_dataset_business.json",
	  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
	)
	reviews <- loadYelpData(
	  pathToYelpJson="data/yelp_academic_dataset_review.json",
	  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
	)
	checkins <- loadYelpData(
	  pathToYelpJson="data/yelp_academic_dataset_checkin.json",
	  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
	)
	users <- loadYelpData(
	  pathToYelpJson="data/yelp_academic_dataset_user.json",
	  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
	)
	tips <- loadYelpData(
	  pathToYelpJson="data/yelp_academic_dataset_tip.json",
	  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
	)

	# Inspection tables: the training labels and the rows to predict, the
	# latter read from the submission format file.
	train <- loadInspections(pathToInspectionsCSV="data/train_labels.csv")
	test <- loadInspections(pathToInspectionsCSV="data/SubmissionFormat.csv")

	# ====================================
	# Make a simple test model
	# ====================================
	makeSimplePredictions <- function(businessData=businesses,
	                                  trainLabels=train,
	                                  testLabels=test,
	                                  pathToSubmissionFormat="data/SubmissionFormat.csv",
	                                  submissionFileName="stars_review_count.csv"){
	  # Fit one linear model per violation level on a restaurant's average
	  # stars and review count, then write the predictions as a submission.
	  #
	  # Args:
	  #   businessData: data frame with business_id, stars and review_count
	  #     columns; defaults to the global `businesses`.
	  #   trainLabels: inspections with restaurant_id and the observed
	  #     one_star / two_stars / three_stars counts; defaults to the
	  #     global `train`.
	  #   testLabels: inspections to predict for; defaults to the global `test`.
	  #   pathToSubmissionFormat: file whose header is used for the output.
	  #   submissionFileName: where the predictions are written.
	  #
	  # Called with no arguments it behaves as before: it reads the globals
	  # and writes "stars_review_count.csv".
	  features <- c("stars", "review_count")

	  # look up the features of the restaurant behind every inspection row
	  X_train <- businessData[match(trainLabels$restaurant_id, businessData$business_id), features]
	  X_test <- businessData[match(testLabels$restaurant_id, businessData$business_id), features]

	  # these do not change between violation levels, so build them once
	  trainFrame <- cbind(trainLabels, X_train)
	  testFrame <- cbind(testLabels, X_test)

	  finalPredictions <- testLabels

	  for(starLevel in c("one_star", "two_stars", "three_stars")){
	    # e.g. one_star ~ stars + review_count
	    modelFormula <- reformulate(features, response=starLevel)

	    # fit a simple linear model
	    model <- lm(modelFormula, data=trainFrame)

	    # predict the violations; as.integer() truncates toward zero so the
	    # result is a whole-number count
	    predictions <- as.integer(predict(model, testFrame))

	    # counts cannot be negative
	    predictions <- pmax(predictions, 0)

	    # store the predictions
	    finalPredictions[starLevel] <- predictions
	  }

	  writeSubmission(finalPredictions, pathToSubmissionFormat, submissionFileName)
	}
	makeSimplePredictions()