Last active
September 11, 2017 18:08
-
-
Save marcellooker/f416f74d75db4bc1987010b9ebc96a00 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# My R script | |
library(RJDBC) | |
library(plyr) | |
library("rjson") #library("rjson", lib.loc="/Library/Frameworks/R.framework/Versions/3.2/Resources/library") | |
library("rJava") #library("rJava", lib.loc="/Library/Frameworks/R.framework/Versions/3.2/Resources/library") | |
library("randomForest") #library("randomForest", lib.loc="/Library/Frameworks/R.framework/Versions/3.2/Resources/library") | |
library("LookR") | |
looker_setup( id = "myapiid", | |
secret = "myapisecret", | |
api_path = "https://mycompany.looker.com:19999/api/3.0" | |
) | |
getTrainingData <- function(){ | |
#This is one method of getting data - best used for high volumes, with no limit | |
print("Getting Training Data...") | |
now <- Sys.time() | |
returnValue <- run_inline_query(model = "my_model", | |
view = "my_explore", | |
fields = c( | |
"my_view_1.input_variabl_a", | |
"my_view_1.input_variabl_b", | |
"my_view_2.input_variabl_c", | |
"my_view_2.input_variabl_d", | |
"my_view_2.input_variabl_e.", | |
"my_view_3.output_variable" | |
), | |
filters = list( | |
c("my_view_1.vilter_variable_a", "my_desired_value_a"), | |
c("my_view_1.vilter_variable_b", "my_desired_value_b") | |
) | |
) | |
print("Done.") | |
then <- Sys.time() | |
print((then - now)) | |
return(returnValue) | |
} | |
getProdData <- function(){ | |
#This is another method of getting data - best used for small volumes, since the limit is enforced, but keeps the code simpler. | |
print("Getting Production Data...") | |
returnValue <- run_look(42) #this is the number for a specific Look you've chosen. | |
print("Done.") | |
return(returnValue) | |
} | |
# GLOBAL VARIABLES | |
mySeed = 321 | |
splitData <- function(data,sampleSize,seed) { | |
# Split the data into two sets, one for training the other for testing. | |
smp_size <- floor(0.75 * nrow(data)) | |
# set the seed to make your partition reproductible | |
if (!is.null(seed)) set.seed(seed) | |
set.seed(seed) | |
train_ind <- sample(seq_len(nrow(data)), size = smp_size) | |
#split out the two sets of data | |
train <- data[train_ind, , drop = FALSE] | |
test <- data[-train_ind, , drop = FALSE] | |
#return a list of the two dataframes | |
return(list(train,test)) | |
} | |
trainForest <- function(thisData){ | |
print("Training Model....") | |
rf <- randomForest( my_view_3.output_variable ~ | |
my_view_1.input_variabl_a + | |
my_view_1.input_variabl_b + | |
my_view_2.input_variabl_c + | |
my_view_2.input_variabl_d + | |
my_view_2.input_variabl_3 | |
, data=thisData) | |
print("Done.") | |
return(rf) | |
} | |
scoreForest <- function(forest,thisData){ | |
#overwrite the test dataframe with the predicted values and scores | |
print("Merging score data...") | |
thisData$win_score <- predict(forest,thisData, "prob") | |
thisData$predict <- predict(forest,thisData) | |
print("Done.") | |
return(thisData) | |
} | |
mainTrain <- function(){ | |
print("TRAINING FUNCTION:") | |
myTrainData <- getTrainingData() | |
mySplitData <- splitData(myTrainData,.7,mySeed) | |
trainData <- mySplitData[[1]] | |
testData <- mySplitData[[2]] | |
rf <- trainForest(trainData) | |
testValues <- scoreForest(rf,testData) | |
return(rf) | |
} | |
mainProd <- function(randomForst){ | |
print("PRODUCTION FUNCTION:") | |
myProdData <- getProdData() | |
scoredData <- scoreForest(randomForst,myProdData) | |
cleanData <- cleanColumns(scoredData) | |
writeToDatabase(cleanData,Sys.Date()) | |
print("Prduction function Complete!") | |
return(cleanData) | |
} | |
writeToDatabase <-function(dataframe,date){ | |
#How to write the final data back to a databse | |
# NOTE THIS IS HOW TO DO IT FOR REDSHIFT ONLY!!!! FOR OTHER DATABASES, PLEASE SEARCH FOR THE APPROPRIATE API ONLINE | |
dataframe$score_date <- rep(date,nrow(dataframe)) | |
print("Getting driver...") | |
driver <- JDBC("com.amazon.redshift.jdbc41.Driver", "RedshiftJDBC41-1.1.9.1009.jar", identifier.quote="`") | |
url <- "jdbc:redshift://MYCOMPANY-REDSHIFT-URL:5439/mycompany?user=USERNAME&password=MYPASSWORD" | |
#Put the correct port number after the URL string | |
print("Done.") | |
print("Establishing connection...") | |
con <- dbConnect(driver, url) | |
print("Done.") | |
print("Writing to DB...") | |
dbWriteTable(con, "public.my_predictions_table", dataframe, overwrite=FALSE, append=TRUE) #set these parameters to | |
dbDisconnect(con) | |
print("Done.") | |
} | |
main <- function(){ | |
random_forest_model <- mainTrain() | |
mainProd(random_forest_model) | |
} | |
main() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment