Skip to content

Instantly share code, notes, and snippets.

View shawlu95's full-sized avatar

Shaw Lu shawlu95

View GitHub Profile
@shawlu95
shawlu95 / 05_sql_feature_engineering_pull_func.py
Last active April 9, 2019 21:15
05_sql_feature_engineering
def load_dataset(split="trn_set", limit=None, ignore_categorical=False):
sql = """
SELECT o.*, f1.*, f2.*, f3.*, f4.*,
EXTRACT(MONTH FROM o.dt) AS month
FROM %s AS t
JOIN Online AS o
ON t.index = o.index
JOIN features_group_1 AS f1
ON t.index = f1.index
JOIN features_group_2 AS f2
mysql> show tables;
+----------------------+
| Tables_in_shutterfly |
+----------------------+
| Online |
| Purchase |
| features_group_1 |
| features_group_2 |
| features_group_3 |
| features_group_4 |
@shawlu95
shawlu95 / 05_sql_feature_engineering_group1.sql
Last active April 9, 2019 22:24
05_sql_feature_engineering
USE Shutterfly;
DROP TABLE IF EXISTS features_group_1;
CREATE TABLE IF NOT EXISTS features_group_1
SELECT o.index
,LEFT(o.dt, 10) AS day
,COUNT(*) AS order_count
,SUM(p.revenue) AS revenue_sum
,MAX(p.revenue) AS revenue_max
@shawlu95
shawlu95 / 05_sql_feature_engineering_check_split.txt
Last active April 9, 2019 19:36
05_sql_feature_engineering
mysql> select count(*) from trn_set;
+----------+
| count(*) |
+----------+
| 859296 |
+----------+
1 row in set (0.61 sec)
mysql> select count(*) from tst_set;
+----------+
@shawlu95
shawlu95 / 05_sql_feature_engineering_split.py
Last active April 9, 2019 19:33
05_sql_feature_engineering
sql = "SELECT `index`, event2 FROM Online;"
df = pd.read_sql_query(sql, engine).set_index('index')
# shuffle dataset, preserving index
df = df.sample(frac=1)
train_frac = 0.9
test_frac = 1 - train_frac
trn_cutoff = int(len(df) * train_frac)
@shawlu95
shawlu95 / 05_sql_feature_engineering_table.txt
Last active April 9, 2019 19:02
05_sql_feature_engineering
mysql> use shutterfly;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A
Database changed
mysql> show tables;
+----------------------+
| Tables_in_shutterfly |
+----------------------+
@shawlu95
shawlu95 / 05_sql_feature_engineering.py
Last active April 12, 2019 19:29
05_sql_feature_engineering
from sqlalchemy import create_engine
import pandas as pd
username = "root"
password = "1234567"
port = 3306
database = "Shutterfly"
engine = create_engine('mysql+mysqldb://%s:%s@localhost:%i/%s'
%(username, password, port, database))
# two-tailed t-test
h_0 = 300
h_1 = 290
n = 50
se = 20 / np.sqrt(n)
power = compute_power(h_0, h_1, se, tail="two")
print("power: %.3f, beta: %.3f"%(power, 1 - power))
# one-tailed z-test
h_0 = 0.8
h_1 = 0.75
n = 100
se = np.sqrt(h_0 * (1 - h_0) / n)
power = compute_power(h_0, h_1, se, tail="left")
print("power: %.3f, beta: %.3f"%(power, 1 - power))
@shawlu95
shawlu95 / 04_power.py
Created April 9, 2019 06:35
04_power
def critical_z(alpha=0.05, tail="two"):
"""
Given significance level, compute critical value.
"""
if tail == "two":
p = 1 - alpha / 2
else:
p = 1 - alpha
return norm.ppf(p)