Stas Sajin (stasSajin)

  • Earnest
  • San Francisco
from sklearn.ensemble import RandomForestRegressor

def estimate_rf(df):
    X = df.income.values.reshape(-1, 1)
    y = df.minutes.values
    rf_reg = RandomForestRegressor(100, n_jobs=8, oob_score=True)
    rf_reg.fit(X, y)
    # residuals: observed minutes minus out-of-bag predictions
    y_new = df.minutes - rf_reg.oob_prediction_
    # difference in mean residuals between treatment and control groups
    return y_new[df.group == 1].mean() - y_new[df.group == 0].mean()
def estimate_cuped(df):
    # estimate the theta that minimizes the variance of the adjusted metric
    theta = df.cov().loc["income", "minutes"] / df.cov().loc["income", "income"]
    y_new = df.minutes - theta * df.income
    # difference in adjusted means between treatment and control groups
    return y_new[df.group == 1].mean() - y_new[df.group == 0].mean()
@stasSajin
stasSajin / generate_data.py
Created June 25, 2020 20:42
Blogpost generate_data
import numpy as np
import pandas as pd

def generate_data(group_size, effect_size=0):
    group1 = np.array([1] * group_size)
    group2 = np.array([0] * group_size)
    # minutes of engagement, truncated at zero
    minutes = np.maximum(0, np.random.normal(25, 20, size=2 * group_size).astype(int))
    income = (np.random.normal(10000, 2000, size=2 * group_size) * minutes).astype(int)
    assets = (np.random.normal(100000, 20000, size=2 * group_size) * minutes).astype(int)
    group_assignment = np.concatenate([group1, group2])
    np.random.shuffle(group_assignment)
    data = pd.DataFrame({'group': group_assignment,
                         'minutes': minutes,
                         'income': income,
                         'assets': assets})
    # shift the treatment group's metric by the effect size
    data.loc[data.group == 1, 'minutes'] += effect_size
    return data
from typing import Union, Dict
from enum import Enum

from pydantic.dataclasses import dataclass
from pydantic import ValidationError, validator
from pydantic import confloat, conint

class ProcedureType(Enum):
    cancer = "cancer"
    flu = "flu"
import pytest
from hypothesis import given
from hypothesis import strategies as st

# generate arbitrary UserAssessment instances from the type's annotations
@given(data=st.from_type(UserAssessment))
def test_apply_treatment(data):
    result = apply_treatment(data)
    # specify asserts in here
from typing import Optional, Union
from enum import Enum
from dataclasses import dataclass

from pydantic import ValidationError

class ProcedureType(Enum):
    cancer = "cancer"
    flu = "flu"
# zero out every interaction term that involves the biased feature
bias_var = np.array(['gender_male'])
bias_idx = np.argwhere(np.isin(np.array(feature_names), bias_var))[0]
interactions[:, bias_idx, :] = 0
interactions[:, :, bias_idx] = 0
# sum the remaining contributions and invert the log1p target transform
y_hat_no_bias = np.exp(interactions.sum(axis=1).sum(axis=1)) - 1
feature_names = dtest.feature_names
# SHAP interaction values: one (features + intercept) x (features + intercept) matrix per row
interactions = xgb_model.predict(dtest, pred_interactions=True)
pd.DataFrame(interactions[0],
             index=feature_names + ['intercept'],
             columns=feature_names + ['intercept'])
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

data = pd.read_feather('data.feather')
data = pd.get_dummies(data, drop_first=True)
# log-transform the salary target before splitting
X_train, X_test, y_train, y_test = train_test_split(data.drop('salary', axis=1),
                                                    np.log1p(data.salary.values),
                                                    test_size=0.30)

Keybase proof

I hereby claim:

  • I am stassajin on github.
  • I am stassajin (https://keybase.io/stassajin) on keybase.
  • I have a public key ASCgFOn7OF-Xnk4rKYYLqm6FUhEGDxCmg9ba3Vd6ecGWTAo

To claim this, I am signing this object: