Thomas Kassel (thomas-kassel)

  • Form Energy
  • San Francisco, CA
thomas-kassel / h2o.gridSearch.R
Last active June 2, 2017 12:27
Grid search with early stopping in h2o GLM
# Define hyperparameter spaces and search criteria for random grid search
GLM_params1 <- list(alpha = seq(0,1,.05), lambda = 10^seq(-7,3,0.5)) # Candidate values for both alpha and lambda
GLM_searchCriteria1 <- list(strategy = "RandomDiscrete", max_runtime_secs = 300) # Randomly choose combinations, training for at most 5 minutes total
# Train models using the parameters above until the runtime limit is reached
# Save the models and their prediction scores on the validation frame in a grid, for access later
GLMgrid <- h2o.grid(algorithm = "glm", grid_id = "GLM_grid1", x = x,
                    y = y, training_frame = train, validation_frame = valid,
                    hyper_params = GLM_params1, search_criteria = GLM_searchCriteria1, seed = 1234)
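
A natural follow-up is to rank the grid and pull out the winning model. A minimal sketch, assuming the grid above has finished training and that validation RMSE is the sorting metric of interest:

# Sort the grid by validation RMSE (metric choice is an assumption) and fetch the best model
sortedGrid <- h2o.getGrid(grid_id = "GLM_grid1", sort_by = "rmse", decreasing = FALSE)
bestGLM <- h2o.getModel(sortedGrid@model_ids[[1]]) # First id after sorting = best-scoring model
h2o.performance(bestGLM, valid = TRUE) # Its metrics on the validation frame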
thomas-kassel / h2o.preprocessing.R
Last active May 31, 2017 22:55
Example of ML preprocessing in h2o (RECS dataset)
# Initiate remote h2o cluster (receives and processes dataset)
# No modeling is done locally - an address key is saved to reference the remote version
h2o.init(nthreads = -1)
# Prepare h2o inputs for modeling
recs.reduced2.h2o <- as.h2o(recs.reduced2) # Coerce DF to an h2o object
# Split h2o data into training, validation, and test frames
# Note: R's set.seed() does not reach h2o's backend RNG, so pass seed to h2o.splitFrame for a reproducible split
data.split <- h2o.splitFrame(recs.reduced2.h2o, ratios = c(.7,.2), seed = 0)
train <- data.split[[1]] # For training
valid <- data.split[[2]] # For validating trained models and comparing different hyperparameter vectors
test  <- data.split[[3]] # Held out for final model evaluation
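
A quick sanity check on the three frames; a minimal sketch, with the roughly 70/20/10 proportions implied by ratios = c(.7,.2):

sapply(list(train = train, valid = valid, test = test), h2o.nrow) # Row counts per frame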
thomas-kassel / cleantechScrapy.Spider.py
Created March 4, 2017 16:38
Crawl spider to extract GTM article information
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
from scrapy.selector import Selector
from cleantechScrapy.items import GTMArticleItem, Cleantech100Item
##### Crawl Spider to scrape GTM articles (minimal sketch; class body, names, and URL patterns below are illustrative) #####
class GTMArticleSpider(CrawlSpider):
    name = 'GTMarticles'
    allowed_domains = ['greentechmedia.com']
    start_urls = ['https://www.greentechmedia.com/articles']
    rules = (Rule(LinkExtractor(allow='/articles/read/'), callback='parse_article', follow=True),)
    def parse_article(self, response): # Populate a GTMArticleItem from each matched article page
        item = GTMArticleItem()
        item['title'] = Selector(response).xpath('//h1/text()').extract_first()
        return item
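
Assuming the illustrative class name above, the spider would be launched from the Scrapy project root with the standard CLI:

scrapy crawl GTMarticles -o gtm_articles.json # -o exports scraped items to a JSON feed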
thomas-kassel / helper.R
Created February 11, 2017 17:01
Join NHANES tables by sequence ID
library(reshape2)
library(dplyr)
# Use reshape2 to melt exercise data from "wide" to "long" format
# Perform left_join on sequence ID to combine exercise information with demographics
exerciseMins <- melt(data = exercise, id.vars = "seqID",
                     measure.vars = c('minsVigWork','minsModWork','minsWalkBike','minsVigRec','minsModRec'),
                     variable.name = 'exercise.type', value.name = 'mins.per.day') %>%
  left_join(demographics, by = 'seqID')
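
The long format produced above makes per-activity summaries straightforward. A minimal sketch using the columns created by melt (dplyr assumed loaded):

# Average self-reported minutes per day for each exercise type
exerciseMins %>%
  group_by(exercise.type) %>%
  summarise(avg.mins.per.day = mean(mins.per.day, na.rm = TRUE))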
thomas-kassel / helper.R
Created February 11, 2017 16:55
Import of NHANES data tables
setwd('./tables')
# Loop through directory with all input tables as CSVs
# Save each as a dataframe in the R environment for shiny use
files <- dir(pattern = '\\.csv$')
for (f in files){
  tablename <- sub('\\.csv$', '', f) # Strip the extension to get the dataframe name
  table <- read.csv(f, stringsAsFactors = F, colClasses = c(seqID = 'character'))
  assign(tablename, table)
}
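
Each CSV is then addressable by its base name, e.g. the demographics table joined in helper.R above (assuming a file ./tables/demographics.csv):

str(demographics) # Confirm seqID was read in as character for later joins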