View sf_crime_15.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split the columns of `train` into two lists of column names, continuous
# and categorical, passing "TargetedCategory" as the dependent variable.
# NOTE(review): cont_cat_split looks like the fastai tabular helper - confirm.
cont, cat = cont_cat_split(train, max_card=5, dep_var="TargetedCategory")
# Drop "Dates" from the categorical names.
cat.remove("Dates")

# Print each group under its heading, four column names per output line.
for heading, names in (("Continuous columns:", cont), ("Categorical columns:", cat)):
    print(heading)
    for start in range(0, len(names), 4):
        print(' ' + ', '.join(names[start:start + 4]))
View dataset-report.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!doctype html><html lang=en><head><meta charset=utf-8><meta name=viewport content="width=device-width, initial-scale=1, shrink-to-fit=no"><meta name=description content="Profile report generated with the `pandas-profiling` Python package"><meta name=author content="Simon Brugman and the open source community."><meta name=generator content="Pandas Profiling v2.9.0"><meta name=url content=https://github.com/pandas-profiling/pandas-profiling><meta name=date content="2020-10-30 00:23:09.831154"><title>SF Crime Data Set Profile</title><style> | |
/*! | |
* Bootstrap v3.3.7 (http://getbootstrap.com) | |
* Copyright 2011-2016 Twitter, Inc. | |
* Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) | |
*//*! normalize.css v3.0.3 | MIT License | github.com/necolas/normalize.css */html{font-family:sans-serif;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section,summary{display:block}audio,canvas,progress,video{dis |
View sf_crime_17.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ROC operating points for the validation predictions.
# NOTE(review): roc_curve is presumably sklearn.metrics.roc_curve - confirm.
fpr, tpr, thresholds = roc_curve(y_validation, y_pred)

# One row with two side-by-side panels, titled left to right.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("ROC Curve", "Precision vs Recall Curve"),
)
View sf_crime_16.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Confusion matrix of the validation predictions, thresholding the scores at
# 0.5 and requesting normalize='true'.
c = confusion_matrix(y_validation, y_pred > 0.5, normalize='true')

# Annotated heatmap with the same class labels on both axes.
fig = ff.create_annotated_heatmap(
    c,
    x=['Not Target', 'Target'],
    y=['Not Target', 'Target'],
    colorscale="Greens",
)
# Axis titles: truth on the y axis, predictions on the x axis (drawn on top).
fig.update_yaxes(title="Truth")
fig.update_xaxes(side="top", title="Prediction")
fig.show()
View sf_crime_15.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One row with two side-by-side panels for the score distribution plots,
# titled left to right.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        "Classwise Score Distributions",
        "Train vs Validation Score Distributions",
    ),
)
# class-wise score distributions | |
fig_distplots = ff.create_distplot( |
View sf_crime_14.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# LightGBM settings: gradient-boosted trees with a binary objective and
# is_unbalance switched on.
params = dict(
    boosting='gbdt',
    objective='binary',
    is_unbalance=True,
    num_class=1,
    learning_rate=0.1,
)

# 'PdDistrict' is declared as a categorical feature of the training matrix.
train_data = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=['PdDistrict'],
)

# Train for 250 boosting rounds, then score the training rows themselves.
model = lgb.train(params, train_data, num_boost_round=250)
y_train_pred = model.predict(X_train)
View sf_crime_13.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pull the feature frames and flattened label arrays out of `to`.
# NOTE(review): `to` looks like a fastai TabularPandas object - confirm.
X_train = to.train.xs
y_train = to.train.ys.values.ravel()
X_validation = to.valid.xs
y_validation = to.valid.ys.values.ravel()

# Boolean mask selecting the validation rows whose label is 1.
mask_positive_class = (y_validation == 1)

# np.bincount(...)[1] is the number of labels equal to 1 in each split.
n_train_positive = np.bincount(y_train)[1]
n_validation_positive = np.bincount(y_validation)[1]
print(f"The train set has {n_train_positive} positive labels.")
print(f"The validation set has {n_validation_positive} positive labels.")
View sf_crime_12.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def time_split(df, validation_pct=0.2):
    """Split *df* chronologically on its "Dates" column.

    The rows are sorted by "Dates" and the date found at position
    ``int(len(df) * (1 - validation_pct))`` is used as the cut-off: rows
    dated on or before it go to training, rows dated after it go to
    validation. Rows sharing the cut-off date therefore always stay together
    on the training side.

    Args:
        df: DataFrame with a "Dates" column.
        validation_pct: Approximate fraction of the most recent rows to hold
            out; must lie in ``[0, 1]``.

    Returns:
        A ``(train_index, validation_index)`` pair of index labels, each in
        chronological order.

    Raises:
        ValueError: If ``validation_pct`` is outside ``[0, 1]``.
    """
    if not 0 <= validation_pct <= 1:
        raise ValueError(
            f"validation_pct must be between 0 and 1, got {validation_pct!r}"
        )

    df = df.sort_values("Dates")
    if df.empty:
        # Nothing to split; hand back two empty indexes instead of failing.
        return df.index, df.index

    # Clamp to the last row so validation_pct=0 does not run off the end.
    split_pos = min(int(len(df) * (1 - validation_pct)), len(df) - 1)
    # Positional lookup: a label-based .loc lookup yields a Series instead of
    # a single date when index labels repeat.
    split_date = df["Dates"].iloc[split_pos]

    dates = df["Dates"]
    train_mask = (dates <= split_date).to_numpy()
    validation_mask = (dates > split_date).to_numpy()
    return df.index[train_mask], df.index[validation_mask]
# Hold out the most recent ~20% of `train` (by "Dates") for validation.
train_idx, validation_idx = time_split(train, validation_pct=0.2)

# Report the size and date range covered by each split.
for split_name, split_idx in (("Training", train_idx), ("Validation", validation_idx)):
    split_dates = train.loc[split_idx, 'Dates']
    print(f"{split_name} data has {len(split_idx)} samples from {split_dates.min()} to {split_dates.max()}")
View sf_crime_11.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split the columns of `train` into two lists of column names, continuous
# and categorical, passing "TargetedCategory" as the dependent variable.
# NOTE(review): cont_cat_split looks like the fastai tabular helper - confirm.
cont, cat = cont_cat_split(train, max_card=5, dep_var="TargetedCategory")
# Drop "Dates" from the categorical names.
cat.remove("Dates")

# One summary line per group: the count followed by the list of names.
print(
    f"{len(cont)} Continuous columns: {cont}",
    f"{len(cat)} categorical columns: {cat}",
    sep="\n",
)
View sf_crime_10.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remember which columns exist before the date features are added.
old_columns = train.columns
known_columns = set(old_columns)

# Expand "Dates" into derived date features; drop=False keeps the original column.
# NOTE(review): add_datepart looks like the fastai tabular helper - confirm.
train = add_datepart(train, "Dates", drop=False)

# List the added columns in DataFrame order. A plain set difference would
# print them in an arbitrary order that can change from run to run.
new_columns = [column for column in train.columns if column not in known_columns]

print(f"add_datepart created {len(new_columns)} new features")
for i, new_column in enumerate(new_columns, start=1):
    print(f" {i}. {new_column}")
NewerOlder