Ben Weber (bgweber), GitHub gists
library(boot)
data <- read.csv("UserSessions.csv")

# Function for computing the difference of differences
run_DiD <- function(data, indices){
  d <- data[indices,]
  new <- mean(d$postval[d$expgroup == 'Test']) / mean(d$priorval[d$expgroup == 'Test'])
  old <- mean(d$postval[d$expgroup == 'Control']) / mean(d$priorval[d$expgroup == 'Control'])
  return((new - old) / old * 100.0)
}
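The statistic above is defined but never run; a minimal sketch of the bootstrap call itself (the replicate count R = 1000 is an assumption):

# run the bootstrap and compute a percentile confidence interval
results <- boot(data = data, statistic = run_DiD, R = 1000)
boot.ci(results, type = "perc")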
library(CausalImpact)
data <- read.csv(file = "DailySessions.csv")

# combine the test and control series into a matrix and plot the input data
ts <- cbind(data$test, data$control)
matplot(ts, type = "l")

# use two-week pre- and post-intervention periods
pre.period <- c(1, 14)
post.period <- c(15, 30)
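The snippet stops before the model is actually fit; a minimal sketch of the CausalImpact call, which treats the first column of ts as the response and the second as the control covariate:

# estimate the causal effect of the intervention and plot the results
impact <- CausalImpact(ts, pre.period, post.period)
plot(impact)
summary(impact)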
bgweber / feature_generation.py (created September 16, 2018)
Generating Features for Raw Event Data
import featuretools as ft

rawEventsDF = ... # load from data warehouse

# 1-hot encode the raw event data
es = ft.EntitySet(id="events")
es = es.entity_from_dataframe(entity_id="events", dataframe=rawEventsDF)
feature_matrix, defs = ft.dfs(entityset=es, target_entity="events", max_depth=1)
encodedDF, encoders = ft.encode_features(feature_matrix, defs)
# create feature encodings for the event and description fields
es = ft.EntitySet(id="plays")
es = es.entity_from_dataframe(entity_id="plays", dataframe=plays_df, index="play_id",
                              variable_types={"event": ft.variable_types.Categorical,
                                              "description": ft.variable_types.Categorical})
f1 = ft.Feature(es["plays"]["event"])
f2 = ft.Feature(es["plays"]["description"])
encoded, _ = ft.encode_features(plays_df, [f1, f2], top_n=10)
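A common follow-up, not part of the gist as captured, is to roll the play-level features up to a parent entity with deep feature synthesis; this sketch uses the older featuretools 0.x API seen above, and the game_id column is hypothetical:

# hypothetical: aggregate play-level features per game (assumes a game_id column)
es = es.normalize_entity(base_entity_id="plays", new_entity_id="games", index="game_id")
game_features, game_defs = ft.dfs(entityset=es, target_entity="games", max_depth=2)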
import numpy as np
import pandas as pd

# load the Boston housing data set (load_boston was removed in scikit-learn 1.2)
from sklearn.datasets import load_boston
boston = load_boston()

# convert to a pandas DataFrame and shuffle the rows
boston_pd = pd.DataFrame(data=np.c_[boston['data'], boston['target']],
                         columns=np.append(boston['feature_names'], 'target')).sample(frac=1)
# linear regression with Spark MLlib
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(maxIter=10, regParam=0.1,
                      elasticNetParam=0.5, labelCol="target")

# fit the model and score the test set
model = lr.fit(boston_train)
boston_pred = model.transform(boston_test)
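A minimal sketch of evaluating these predictions; the evaluator assumes the default "prediction" output column:

from pyspark.ml.evaluation import RegressionEvaluator
# compute R^2 on the held-out predictions
r2 = RegressionEvaluator(labelCol="target", metricName="r2").evaluate(boston_pred)
print(r2)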
from pyspark.ml.feature import VectorAssembler
# convert to a Spark data frame
boston_sp = spark.createDataFrame(boston_pd)
display(boston_sp.take(5))
# split into training and test spark data frames
boston_train = spark.createDataFrame(boston_pd[:400])
boston_test = spark.createDataFrame(boston_pd[400:])
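The VectorAssembler import above is unused in the lines captured here; a sketch of how it would typically pack the predictor columns into the "features" vector that LinearRegression expects (taking the column names from boston['feature_names'] is an assumption):

# assemble the predictor columns into a single 'features' vector column
assembler = VectorAssembler(inputCols=[str(c) for c in boston['feature_names']],
                            outputCol="features")
boston_train = assembler.transform(boston_train)
boston_test = assembler.transform(boston_test)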
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# grid search over elasticNetParam with 10-fold cross validation
# (grid params must come from an estimator instance, not the class)
lr = LinearRegression(labelCol="target")
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=ParamGridBuilder().addGrid(
                              lr.elasticNetParam, [0, 0.5, 1.0]).build(),
                          evaluator=RegressionEvaluator(
                              labelCol="target", metricName="r2"),
                          numFolds=10)
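A sketch of actually running the search, reusing the earlier train/test split:

# fit the cross-validated model and score the test set
cvModel = crossval.fit(boston_train)
boston_pred = cvModel.transform(boston_test)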
# sklearn version
from sklearn.ensemble import RandomForestRegressor as RFR
from multiprocessing.pool import ThreadPool

# allow up to 5 concurrent threads
pool = ThreadPool(5)

# hyperparameter values to try (number of trees, n_estimators)
parameters = [10, 20, 50]
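The pool and parameter list are never used in the lines captured here; a minimal sketch of the fan-out, with a hypothetical helper and the 400-row split from above:

# hypothetical helper: train one forest and report R^2 on the test rows
def sklearn_random_forest(trees, X_train, y_train, X_test, y_test):
    model = RFR(n_estimators=trees).fit(X_train, y_train)
    return [trees, model.score(X_test, y_test)]

X = boston_pd.drop(columns='target')
y = boston_pd['target']
results = pool.map(
    lambda trees: sklearn_random_forest(trees, X[:400], y[:400], X[400:], y[400:]),
    parameters)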
# spark version
from pyspark.ml.regression import RandomForestRegressor

# define a function to train a RF model and return metrics
def mllib_random_forest(trees, boston_train, boston_test):
    # train a random forest regressor with the specified number of trees
    rf = RandomForestRegressor(numTrees=trees, labelCol="target")
    model = rf.fit(boston_train)
    # score the test set and return the tree count with the R^2 metric
    boston_pred = model.transform(boston_test)
    r2 = RegressionEvaluator(labelCol="target", metricName="r2").evaluate(boston_pred)
    return [trees, r2]
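As with the sklearn version, a sketch of dispatching the Spark runs across the thread pool, so that each thread submits a separate Spark job:

results = pool.map(
    lambda trees: mllib_random_forest(trees, boston_train, boston_test),
    parameters)
print(results)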