Skip to content

Instantly share code, notes, and snippets.

View eugeneyan's full-sized avatar
👨‍💻
Learning. Building. Writing.

Eugene Yan eugeneyan

👨‍💻
Learning. Building. Writing.
View GitHub Profile
@eugeneyan
eugeneyan / titanic-data-set-sample
Created February 21, 2021 19:02
Titanic dataset sample
+-------------+----------+--------+-----------------------------------------+--------+-----+-------+-------+-----------+---------+-------+----------+
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
+-------------+----------+--------+-----------------------------------------+--------+-----+-------+-------+-----------+---------+-------+----------+
| 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22 | 1 | 0 | A/5 21171 | 7.25 | nan | S |
| 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence... | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26 | 0 | 0 | STON/O2. | 7.925 | nan | S |
| 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily M... | female | 35 | 1 | 0 | 113803 |
@eugeneyan
eugeneyan / test_gini.py
Last active February 21, 2021 19:06
Test Gini impurity and gain
def test_gini_impurity():
    """Check gini_impurity on 8-element binary label lists.

    Each case holds ``n_zeros`` zeros after the ones; the expected value is
    the impurity rounded to three decimal places.
    """
    total = 8
    expected_by_n_zeros = {
        0: 0,
        1: 0.219,
        2: 0.375,
        3: 0.469,
        4: 0.500,
    }
    for n_zeros, expected in expected_by_n_zeros.items():
        labels = [1] * (total - n_zeros) + [0] * n_zeros
        assert round(gini_impurity(labels), 3) == expected
def test_gini_gain():
    """Check gini_gain for a balanced parent split into two pure halves."""
    parent = [1, 1, 1, 1, 0, 0, 0, 0]
    pure_splits = [[1, 1, 1, 1], [0, 0, 0, 0]]
    gain = gini_gain(parent, pure_splits)
    assert round(gain, 3) == 0.5
@eugeneyan
eugeneyan / test_output.py
Created February 21, 2021 19:05
Test output
def test_dt_output_shape(dummy_titanic):
    """Check that predictions are 1-D with one entry per input row.

    A default DecisionTree is fitted on the training split, then asked to
    predict on both the training and the test features.
    """
    X_train, y_train, X_test, y_test = dummy_titanic

    model = DecisionTree()
    model.fit(X_train, y_train)
    train_out = model.predict(X_train)
    test_out = model.predict(X_test)

    n_train_rows = X_train.shape[0]
    n_test_rows = X_test.shape[0]
    assert train_out.shape == (n_train_rows,), 'DecisionTree output should be same as training labels.'
    assert test_out.shape == (n_test_rows,), 'DecisionTree output should be same as testing labels.'
@eugeneyan
eugeneyan / test_output_range.py
Created February 21, 2021 19:06
Test output range
def test_dt_output_range(dummy_titanic):
    """Check that every prediction on both splits lies within [0, 1].

    A default DecisionTree is fitted on the training split and its
    predictions for the training and test features are range-checked.
    """
    X_train, y_train, X_test, y_test = dummy_titanic

    model = DecisionTree()
    model.fit(X_train, y_train)
    train_out = model.predict(X_train)
    test_out = model.predict(X_test)

    range_msg = 'Decision tree output should range from 0 to 1 inclusive'
    train_in_range = (train_out <= 1).all() & (train_out >= 0).all()
    test_in_range = (test_out <= 1).all() & (test_out >= 0).all()
    assert train_in_range, range_msg
    assert test_in_range, range_msg
@eugeneyan
eugeneyan / test_data_leak.py
Created February 21, 2021 19:07
Test data leak
def test_data_leak_in_test_data(dummy_titanic_df):
    """Check that no row appears more than once across train and test.

    The two frames are stacked and fully-duplicated rows are dropped; if the
    row count shrinks, at least one row was repeated. Note that a duplicate
    inside a single split also trips this check, not only a train/test
    overlap.
    """
    train, test = dummy_titanic_df

    # Non-mutating drop: the concatenated frame is a temporary, so there is
    # nothing to gain from inplace=True.
    deduped = pd.concat([train, test]).drop_duplicates()

    n_expected = train.shape[0] + test.shape[0]
    n_duplicates = n_expected - deduped.shape[0]
    assert deduped.shape[0] == n_expected, (
        f'Found {n_duplicates} duplicated row(s) in the combined train and test data; '
        'test data may have leaked into train.'
    )
@eugeneyan
eugeneyan / test_model_can_overfit.py
Created February 21, 2021 19:08
Test model can overfit
@pytest.fixture
def dummy_feats_and_labels():
feats = np.array([[0.7057, -5.4981, 8.3368, -2.8715],
[2.4391, 6.4417, -0.80743, -0.69139],
[-0.2062, 9.2207, -3.7044, -6.8103],
[4.2586, 11.2962, -4.0943, -4.3457],
[-2.343, 12.9516, 3.3285, -5.9426],
[-2.0545, -10.8679, 9.4926, -1.4116],
[2.2279, 4.0951, -4.8037, -2.1112],
[-6.1632, 8.7096, -0.21621, -3.6345],
@eugeneyan
eugeneyan / test_depth_acc.py
Created February 21, 2021 19:09
Test Tree depth leads to increased accuracy
def test_dt_increase_acc(dummy_titanic):
X_train, y_train, _, _ = dummy_titanic
acc_list = []
auc_list = []
for depth in range(1, 10):
dt = DecisionTree(depth_limit=depth)
dt.fit(X_train, y_train)
pred = dt.predict(X_train)
pred_binary = np.round(pred)
acc_list.append(accuracy_score(y_train, pred_binary))
@eugeneyan
eugeneyan / test_invariance.py
Created February 21, 2021 19:10
Test invariance
@pytest.fixture
def dummy_passengers():
# Based on passenger 2 (high passenger class female)
passenger2 = {'PassengerId': 2,
'Pclass': 1,
'Name': ' Mrs. John',
'Sex': 'female',
'Age': 38.0,
'SibSp': 1,
'Parch': 0,
@eugeneyan
eugeneyan / test_directional_expectation.py
Last active February 21, 2021 19:11
Test directional expectation
def test_dt_directional_expectation(dummy_titanic_dt, dummy_passengers):
model = dummy_titanic_dt
_, p2 = dummy_passengers
# Get original survival probability of passenger 2
test_df = pd.DataFrame.from_dict([p2], orient='columns')
X, y = get_feats_and_labels(prep_df(test_df))
p2_prob = model.predict(X)[0] # 1.0
# Change gender from female to male
@eugeneyan
eugeneyan / test_evalution.py
Created February 21, 2021 19:13
Test evaluation
def test_dt_evaluation(dummy_titanic_dt, dummy_titanic):
    """Check held-out accuracy and ROC AUC against minimum thresholds.

    The model supplied by ``dummy_titanic_dt`` (presumably already fitted --
    confirm against the fixture) predicts on the test features. Accuracy is
    computed on the rounded predictions, AUC on the raw ones.
    """
    _, _, X_test, y_test = dummy_titanic
    clf = dummy_titanic_dt

    scores = clf.predict(X_test)
    hard_labels = np.round(scores)

    accuracy = accuracy_score(y_test, hard_labels)
    auc = roc_auc_score(y_test, scores)

    assert accuracy > 0.82, 'Accuracy on test should be > 0.82'
    assert auc > 0.84, 'AUC ROC on test should be > 0.84'