This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(boot)

# Load the experiment data; it is expected to provide the columns that
# run_DiD() reads.
data <- read.csv("UserSessions.csv")
# Function for computing the difference of differences
#
# Takes a data frame and a vector of row indices (the (data, indices)
# signature matches what boot::boot() passes to its statistic -- presumably
# how this is used; confirm against the caller).
#
# data:    data frame with columns expgroup ("Test" / "Control"),
#          priorval and postval.
# indices: row indices selecting the (re)sample to evaluate.
#
# Returns the percentage lift of the Test group's post/prior ratio over the
# Control group's post/prior ratio.
run_DiD <- function(data, indices) {
  d <- data[indices, ]

  # Both groups must be selected through the same column; filtering one
  # group on a different (missing) column yields an empty subset and NaN.
  is_test <- d$expgroup == "Test"
  is_control <- d$expgroup == "Control"

  new <- mean(d$postval[is_test]) / mean(d$priorval[is_test])
  old <- mean(d$postval[is_control]) / mean(d$priorval[is_control])

  (new - old) / old * 100.0
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(CausalImpact)

data <- read.csv(file = "DailySessions.csv")

# Bind the test and control series into a two-column matrix and plot both.
# NOTE(review): `ts` masks stats::ts for the rest of the script; the name is
# kept because later code may refer to it.
ts <- cbind(data$test, data$control)
matplot(ts, type = "l")

# Pre-period is rows 1-14 (two weeks, assuming one row per day -- confirm);
# post-period is rows 15-30, i.e. 16 rows rather than exactly two weeks.
pre.period <- c(1, 14)
post.period <- c(15, 30)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import featuretools as ft

rawEventsDF = ...  # load from data warehouse

# 1-hot encode the raw event data
es = ft.EntitySet(id="events")
# register the frame loaded above (the name must match rawEventsDF)
es = es.entity_from_dataframe(entity_id="events", dataframe=rawEventsDF)
feature_matrix, defs = ft.dfs(entityset=es, target_entity="events", max_depth=1)
encodedDF, encoders = ft.encode_features(feature_matrix, defs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create feature encodings for the event and description fields
es = ft.EntitySet(id="plays")
es = es.entity_from_dataframe(
    entity_id="plays",
    dataframe=plays_df,
    index="play_id",
    variable_types={
        "event": ft.variable_types.Categorical,
        "description": ft.variable_types.Categorical,
    },
)
# reference Feature through the ft namespace, like every other featuretools
# call in this snippet, so no separate `from featuretools import Feature`
# is required
f1 = ft.Feature(es["plays"]["event"])
f2 = ft.Feature(es["plays"]["description"])
# top_n=10 limits how many distinct values of each feature get a column
encoded, _ = ft.encode_features(plays_df, [f1, f2], top_n=10)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd

# load the boston data set
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this snippet needs an older scikit-learn release to run.
from sklearn.datasets import load_boston

boston = load_boston()

# convert to a Pandas Data Frame: the feature columns followed by a "target"
# column; sample(frac=1) returns every row in shuffled order (no seed is set,
# so the order differs between runs)
boston_pd = pd.DataFrame(
    data=np.c_[boston['data'], boston['target']],
    columns=np.append(boston['feature_names'], 'target'),
).sample(frac=1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# linear regression with Spark
from pyspark.ml.regression import LinearRegression

# linear regression with elastic-net regularisation, predicting "target"
# NOTE(review): no featuresCol is passed, so Spark's default features column
# is used -- confirm boston_train carries an assembled feature vector.
lr = LinearRegression(maxIter=10, regParam=0.1,
                      elasticNetParam=0.5, labelCol="target")

# Fit the model on the training frame, then score the test frame
model = lr.fit(boston_train)
boston_pred = model.transform(boston_test)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.ml.feature import VectorAssembler

# convert to a Spark data frame and preview the first five rows
boston_sp = spark.createDataFrame(boston_pd)
display(boston_sp.take(5))

# split into training and test spark data frames:
# the first 400 rows train the model, the remaining rows are held out
boston_train = spark.createDataFrame(boston_pd[:400])
boston_test = spark.createDataFrame(boston_pd[400:])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# 10-fold cross validation over the elastic-net mixing parameter, scored by r2.
# The grid is keyed on the Param of the estimator *instance* handed to the
# CrossValidator, so each grid entry is matched to that estimator's own
# parameter rather than to the unbound class-level Param.
cv_estimator = LinearRegression(labelCol="target")
cv_param_grid = (
    ParamGridBuilder()
    .addGrid(cv_estimator.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)
crossval = CrossValidator(
    estimator=cv_estimator,
    estimatorParamMaps=cv_param_grid,
    evaluator=RegressionEvaluator(labelCol="target", metricName="r2"),
    numFolds=10,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# sklearn version
from sklearn.ensemble import RandomForestRegressor as RFR
from multiprocessing.pool import ThreadPool

# allow up to 5 concurrent threads
pool = ThreadPool(5)

# hyperparameters to test out (n_trees)
parameters = [10, 20, 50]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# spark version | |
from pyspark.ml.regression import RandomForestRegressor | |
# define a function to train a RF model and return metrics | |
def mllib_random_forest(trees, boston_train, boston_test): | |
# train a random forest regressor with the specified number of trees | |
rf = RandomForestRegressor(numTrees = trees, labelCol="target") | |
model = rf.fit(boston_train) |
OlderNewer