Skip to content

Instantly share code, notes, and snippets.

@drivendata
Created May 28, 2015 15:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save drivendata/52dbb123980e8a62e5fa to your computer and use it in GitHub Desktop.
Save drivendata/52dbb123980e8a62e5fa to your computer and use it in GitHub Desktop.
functions_to_keep_it_fresh.R
# INSTALL JSON PACKAGE IF NEEDED
#install.packages("jsonlite")
library("jsonlite")
# ===== FUNCTIONS FOR ID MATCHING ============
# Build a lookup from Yelp business id to Boston restaurant id.
#
# The CSV at `pathToRestaurantIdsCSV` is read with empty cells treated as NA.
# Its first column is used as the restaurant id; every further column is used
# as a Yelp id that belongs to the restaurant on the same row.
#
# Returns a one-column data frame whose row names are the Yelp ids and whose
# single column holds the matching restaurant id. If the file has no Yelp id
# columns at all, an empty (zero-row) one-column data frame is returned.
getReverseIds <- function(pathToRestaurantIdsCSV) {
  # load the id map from the csv
  bosToYelp <- read.csv(pathToRestaurantIdsCSV, na.strings = "")

  # every column after the first; seq_len()[-1] is empty (instead of counting
  # down 2, 1 like seq(2, n) does) when only the id column is present
  yelpColumns <- seq_len(ncol(bosToYelp))[-1]
  if (length(yelpColumns) == 0) {
    return(bosToYelp[0, 1, drop = FALSE])
  }

  reversedColumns <- lapply(yelpColumns, function(j) {
    # pair the restaurant ids with this column of yelp ids
    reverse <- bosToYelp[, c(1, j)]
    # drop rows where this yelp id slot is unused (NA)
    reverse <- reverse[complete.cases(reverse), ]
    # index the rows by yelp id
    row.names(reverse) <- reverse[, 2]
    # keep only the restaurant id, still as a data frame so rbind can stack it
    reverse[, 1, drop = FALSE]
  })

  # stack the per-column lookups into one table
  do.call("rbind", reversedColumns)
}
# Swap the Yelp ids in `yelpDataFrame$business_id` for Boston restaurant ids.
#
# The lookup comes from getReverseIds(pathToRestaurantIdsCSV), whose row names
# are Yelp ids. Ids that are not present in the lookup come back as NA.
# Returns the data frame with its business_id column replaced.
replaceYelpWithBostonIds <- function(yelpDataFrame, pathToRestaurantIdsCSV) {
  # lookup table: row name = yelp id, value = boston id
  idLookup <- getReverseIds(pathToRestaurantIdsCSV)
  # position of each yelp id within the lookup (NA when unknown)
  rowPositions <- match(yelpDataFrame$business_id, row.names(idLookup))
  # pull the boston ids out in the same order as the input rows
  yelpDataFrame$business_id <- idLookup[rowPositions, ]
  return(yelpDataFrame)
}
# ===== FUNCTIONS FOR LOADING YELP DATA ============
# Load one Yelp dataset file (one JSON object per line) into a data frame.
#
# pathToYelpJson:         path to the newline-delimited JSON file.
# pathToRestaurantIdsCSV: path to the id-mapping CSV, handed to
#                         replaceYelpWithBostonIds() when needed.
#
# Returns whatever fromJSON(..., flatten = TRUE) yields for the records. When
# the result has a `business_id` column, its Yelp ids are replaced by Boston
# restaurant ids.
loadYelpData <- function(pathToYelpJson, pathToRestaurantIdsCSV) {
  jsonLines <- readLines(pathToYelpJson)
  # skip blank lines (e.g. a trailing empty line); joining them would leave
  # empty array elements such as ",," that the JSON parser rejects
  jsonLines <- jsonLines[nzchar(trimws(jsonLines))]
  # join the records with "," and wrap in "[" "]" so the parser sees one array
  jsonData <- paste0("[", paste(jsonLines, collapse = ","), "]")
  # parse the data into a 2d data frame
  yelpDf <- fromJSON(jsonData, flatten = TRUE)
  # replace yelp ids with boston ids if this file has business ids
  if ("business_id" %in% colnames(yelpDf)) {
    yelpDf <- replaceYelpWithBostonIds(yelpDf, pathToRestaurantIdsCSV)
  }
  yelpDf
}
# ===== FUNCTIONS FOR INSPECTION DATA ===========
# Read an inspections CSV and give its columns fixed, syntactic names.
#
# The first column of the file becomes the row names. The remaining columns
# must number exactly five and are renamed, in order, to:
#   date, restaurant_id, one_star, two_stars, three_stars
#
# Stops with an error when the file does not have exactly five data columns,
# rather than silently producing NA column names.
loadInspections <- function(pathToInspectionsCSV) {
  inspections <- read.csv(pathToInspectionsCSV, header = TRUE, row.names = 1)
  inspectionColumns <- c("date", "restaurant_id", "one_star", "two_stars", "three_stars")
  if (ncol(inspections) != length(inspectionColumns)) {
    stop(
      "Expected ", length(inspectionColumns), " data columns in ",
      pathToInspectionsCSV, " but found ", ncol(inspections),
      call. = FALSE
    )
  }
  colnames(inspections) <- inspectionColumns
  inspections
}
# Write predictions to a CSV using the header of the submission format file.
#
# predictionDataFrame:    the predictions; its column names are overwritten.
# pathToSubmissionFormat: CSV whose column names (read verbatim, first column
#                         used as row names) define the output header.
# submissionFileName:     where the CSV is written.
writeSubmission <- function(predictionDataFrame,
                            pathToSubmissionFormat,
                            submissionFileName = "new_submission.csv") {
  # read the template only for its header; check.names = FALSE keeps the
  # column names exactly as they appear in the file
  template <- read.csv(pathToSubmissionFormat, check.names = FALSE, row.names = 1)
  expectedHeader <- colnames(template)
  # relabel the prediction columns to match the template
  colnames(predictionDataFrame) <- expectedHeader
  # save to disk
  write.csv(predictionDataFrame, file = submissionFileName)
}
# ====================================
# LOAD THE DATA FROM DISK
# ====================================
# Load every dataset into a global. These statements run when the file is
# sourced and use paths relative to the current working directory (data/...).
# Each Yelp file goes through loadYelpData(), which swaps Yelp business ids for
# Boston restaurant ids whenever the parsed data has a business_id column.
businesses <- loadYelpData("data/yelp_academic_dataset_business.json", "data/restaurant_ids_to_yelp_ids.csv")
reviews <- loadYelpData("data/yelp_academic_dataset_review.json", "data/restaurant_ids_to_yelp_ids.csv")
checkins <- loadYelpData("data/yelp_academic_dataset_checkin.json", "data/restaurant_ids_to_yelp_ids.csv")
users <- loadYelpData("data/yelp_academic_dataset_user.json", "data/restaurant_ids_to_yelp_ids.csv")
tips <- loadYelpData("data/yelp_academic_dataset_tip.json", "data/restaurant_ids_to_yelp_ids.csv")
# Inspection tables. The submission format is read with the same loader, so
# `test` gets the same column names as `train`.
train <- loadInspections("data/train_labels.csv")
test <- loadInspections("data/SubmissionFormat.csv")
# ====================================
# Make a simple test model
# ====================================
# Fit a baseline and write a submission file.
#
# Uses the globals `businesses`, `train` and `test`. For each violation level
# (one_star, two_stars, three_stars) a linear model is fit on the restaurant's
# average stars and review count, then applied to the test rows. Predictions
# are truncated to integers, negative values are set to 0, and the result is
# written to "stars_review_count.csv" via writeSubmission().
makeSimplePredictions <- function() {
  featureNames <- c("stars", "review_count")

  # look up each inspection's restaurant in the business table
  # (restaurants with no match produce NA feature rows)
  trainRows <- match(train$restaurant_id, businesses$business_id)
  testRows <- match(test$restaurant_id, businesses$business_id)

  # labels and features side by side; the same frames serve every level
  trainFrame <- cbind(train, businesses[trainRows, featureNames])
  testFrame <- cbind(test, businesses[testRows, featureNames])

  # start from the test table and overwrite one target column per pass
  finalPredictions <- test
  for (target in c("one_star", "two_stars", "three_stars")) {
    # e.g. "one_star ~ stars + review_count"
    modelSpec <- paste0(target, " ~ stars + review_count")
    fit <- lm(formula = modelSpec, data = trainFrame)
    # violation counts are whole numbers: truncate the fitted values
    counts <- as.integer(predict(fit, newdata = testFrame))
    # counts cannot be negative: clamp anything below 0 up to 0
    counts[counts < 0] <- 0
    finalPredictions[target] <- counts
  }

  writeSubmission(finalPredictions, "data/SubmissionFormat.csv", "stars_review_count.csv")
}
# Run the baseline when this file is sourced: fits the three linear models and
# writes the predictions to stars_review_count.csv.
makeSimplePredictions()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment