Skip to content

Instantly share code, notes, and snippets.

@marcellooker
Last active September 11, 2017 18:08
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save marcellooker/f416f74d75db4bc1987010b9ebc96a00 to your computer and use it in GitHub Desktop.
Save marcellooker/f416f74d75db4bc1987010b9ebc96a00 to your computer and use it in GitHub Desktop.
# My R script
library(RJDBC)
library(plyr)
library("rjson") #library("rjson", lib.loc="/Library/Frameworks/R.framework/Versions/3.2/Resources/library")
library("rJava") #library("rJava", lib.loc="/Library/Frameworks/R.framework/Versions/3.2/Resources/library")
library("randomForest") #library("randomForest", lib.loc="/Library/Frameworks/R.framework/Versions/3.2/Resources/library")
library("LookR")
looker_setup( id = "myapiid",
secret = "myapisecret",
api_path = "https://mycompany.looker.com:19999/api/3.0"
)
getTrainingData <- function(){
#This is one method of getting data - best used for high volumes, with no limit
print("Getting Training Data...")
now <- Sys.time()
returnValue <- run_inline_query(model = "my_model",
view = "my_explore",
fields = c(
"my_view_1.input_variabl_a",
"my_view_1.input_variabl_b",
"my_view_2.input_variabl_c",
"my_view_2.input_variabl_d",
"my_view_2.input_variabl_e.",
"my_view_3.output_variable"
),
filters = list(
c("my_view_1.vilter_variable_a", "my_desired_value_a"),
c("my_view_1.vilter_variable_b", "my_desired_value_b")
)
)
print("Done.")
then <- Sys.time()
print((then - now))
return(returnValue)
}
getProdData <- function(){
#This is another method of getting data - best used for small volumes, since the limit is enforced, but keeps the code simpler.
print("Getting Production Data...")
returnValue <- run_look(42) #this is the number for a specific Look you've chosen.
print("Done.")
return(returnValue)
}
# GLOBAL VARIABLES
mySeed = 321
splitData <- function(data,sampleSize,seed) {
# Split the data into two sets, one for training the other for testing.
smp_size <- floor(0.75 * nrow(data))
# set the seed to make your partition reproductible
if (!is.null(seed)) set.seed(seed)
set.seed(seed)
train_ind <- sample(seq_len(nrow(data)), size = smp_size)
#split out the two sets of data
train <- data[train_ind, , drop = FALSE]
test <- data[-train_ind, , drop = FALSE]
#return a list of the two dataframes
return(list(train,test))
}
trainForest <- function(thisData){
print("Training Model....")
rf <- randomForest( my_view_3.output_variable ~
my_view_1.input_variabl_a +
my_view_1.input_variabl_b +
my_view_2.input_variabl_c +
my_view_2.input_variabl_d +
my_view_2.input_variabl_3
, data=thisData)
print("Done.")
return(rf)
}
scoreForest <- function(forest,thisData){
#overwrite the test dataframe with the predicted values and scores
print("Merging score data...")
thisData$win_score <- predict(forest,thisData, "prob")
thisData$predict <- predict(forest,thisData)
print("Done.")
return(thisData)
}
mainTrain <- function(){
print("TRAINING FUNCTION:")
myTrainData <- getTrainingData()
mySplitData <- splitData(myTrainData,.7,mySeed)
trainData <- mySplitData[[1]]
testData <- mySplitData[[2]]
rf <- trainForest(trainData)
testValues <- scoreForest(rf,testData)
return(rf)
}
mainProd <- function(randomForst){
print("PRODUCTION FUNCTION:")
myProdData <- getProdData()
scoredData <- scoreForest(randomForst,myProdData)
cleanData <- cleanColumns(scoredData)
writeToDatabase(cleanData,Sys.Date())
print("Prduction function Complete!")
return(cleanData)
}
writeToDatabase <-function(dataframe,date){
#How to write the final data back to a databse
# NOTE THIS IS HOW TO DO IT FOR REDSHIFT ONLY!!!! FOR OTHER DATABASES, PLEASE SEARCH FOR THE APPROPRIATE API ONLINE
dataframe$score_date <- rep(date,nrow(dataframe))
print("Getting driver...")
driver <- JDBC("com.amazon.redshift.jdbc41.Driver", "RedshiftJDBC41-1.1.9.1009.jar", identifier.quote="`")
url <- "jdbc:redshift://MYCOMPANY-REDSHIFT-URL:5439/mycompany?user=USERNAME&password=MYPASSWORD"
#Put the correct port number after the URL string
print("Done.")
print("Establishing connection...")
con <- dbConnect(driver, url)
print("Done.")
print("Writing to DB...")
dbWriteTable(con, "public.my_predictions_table", dataframe, overwrite=FALSE, append=TRUE) #set these parameters to
dbDisconnect(con)
print("Done.")
}
main <- function(){
random_forest_model <- mainTrain()
mainProd(random_forest_model)
}
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment