Thomas Kassel (thomas-kassel)

  • Form Energy
  • San Francisco, CA
thomas-kassel / h2o.gridSearch.R
Last active June 2, 2017 12:27
Grid search with early stopping in h2o GLM
# Define hyperparameter spaces and search criteria for random grid search
GLM_params1 <- list(alpha = seq(0,1,.05), lambda = 10^seq(-7,3,0.5)) # Candidate values for both alpha and lambda
GLM_searchCriteria1 <- list(strategy = "RandomDiscrete", max_runtime_secs = 300) # Randomly choose combinations, training for at most 5 minutes total
# Train models using the parameters above until the runtime limit is reached
# Save the models and their prediction scores on the validation frame in a grid, for access later
GLMgrid <- h2o.grid(algorithm = "glm", grid_id = "GLM_grid1", x = x,
                    y = y, training_frame = train, validation_frame = valid,
                    hyper_params = GLM_params1, search_criteria = GLM_searchCriteria1, seed = 1234)
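
A natural follow-up is to rank the grid and pull out the winning model. A minimal sketch, assuming the grid above has finished training and that validation RMSE is the sorting metric of interest:

# Sort the grid by validation RMSE (metric choice is an assumption) and fetch the best model
sortedGrid <- h2o.getGrid(grid_id = "GLM_grid1", sort_by = "rmse", decreasing = FALSE)
bestGLM <- h2o.getModel(sortedGrid@model_ids[[1]]) # First id after sorting = best-scoring model
h2o.performance(bestGLM, valid = TRUE) # Its metrics on the validation frame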
thomas-kassel / h2o.preprocessing.R
Last active May 31, 2017 22:55
Example of ML preprocessing in h2o (RECS dataset)
# Initiate remote h2o cluster (receives and processes dataset)
# No modeling is done locally - an address key is saved to reference the remote version
h2o.init(nthreads = -1)
# Prepare h2o inputs for modeling
recs.reduced2.h2o <- as.h2o(recs.reduced2) # Coerce DF to an h2o object
# Split h2o data into training, validation, and test frames
# Note: R's set.seed() does not reach h2o's backend RNG, so pass seed to h2o.splitFrame for a reproducible split
data.split <- h2o.splitFrame(recs.reduced2.h2o, ratios = c(.7,.2), seed = 0)
train <- data.split[[1]] # For training
valid <- data.split[[2]] # For validating trained models and comparing different hyperparameter vectors
test  <- data.split[[3]] # Held out for final model evaluation
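
A quick sanity check on the three frames; a minimal sketch, with the roughly 70/20/10 proportions implied by ratios = c(.7,.2):

sapply(list(train = train, valid = valid, test = test), h2o.nrow) # Row counts per frame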
thomas-kassel / cleantechScrapy.Spider.py
Created March 4, 2017 16:38
Crawl spider to extract GTM article information
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
from scrapy.selector import Selector
from cleantechScrapy.items import GTMArticleItem, Cleantech100Item
##### Crawl Spider to scrape GTM articles (minimal sketch; class body, names, and URL patterns below are illustrative) #####
class GTMArticleSpider(CrawlSpider):
    name = 'GTMarticles'
    allowed_domains = ['greentechmedia.com']
    start_urls = ['https://www.greentechmedia.com/articles']
    rules = (Rule(LinkExtractor(allow='/articles/read/'), callback='parse_article', follow=True),)
    def parse_article(self, response): # Populate a GTMArticleItem from each matched article page
        item = GTMArticleItem()
        item['title'] = Selector(response).xpath('//h1/text()').extract_first()
        return item
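
Assuming the illustrative class name above, the spider would be launched from the Scrapy project root with the standard CLI:

scrapy crawl GTMarticles -o gtm_articles.json # -o exports scraped items to a JSON feed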
thomas-kassel / helper.R
Created February 11, 2017 17:01
Join NHANES tables by sequence ID
library(reshape2)
library(dplyr)
# Use reshape2 to melt exercise data from "wide" to "long" format
# Perform left_join on sequence ID to combine exercise information with demographics
exerciseMins <- melt(data = exercise, id.vars = "seqID",
                     measure.vars = c('minsVigWork','minsModWork','minsWalkBike','minsVigRec','minsModRec'),
                     variable.name = 'exercise.type', value.name = 'mins.per.day') %>%
  left_join(demographics, by = 'seqID')
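
The long format produced above makes per-activity summaries straightforward. A minimal sketch using the columns created by melt (dplyr assumed loaded):

# Average self-reported minutes per day for each exercise type
exerciseMins %>%
  group_by(exercise.type) %>%
  summarise(avg.mins.per.day = mean(mins.per.day, na.rm = TRUE))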
thomas-kassel / helper.R
Created February 11, 2017 16:55
Import of NHANES data tables
setwd('./tables')
# Loop through directory with all input tables as CSVs
# Save each as a dataframe in the R environment for shiny use
files <- dir(pattern = '\\.csv$')
for (f in files){
  tablename <- sub('\\.csv$', '', f) # Strip the extension to get the dataframe name
  table <- read.csv(f, stringsAsFactors = F, colClasses = c(seqID = 'character'))
  assign(tablename, table)
}
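
Each CSV is then addressable by its base name, e.g. the demographics table joined in helper.R above (assuming a file ./tables/demographics.csv):

str(demographics) # Confirm seqID was read in as character for later joins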