Skip to content

Instantly share code, notes, and snippets.

Avatar
😸
so make it

Anthony Agnone aagnone3

😸
so make it
View GitHub Profile
View sf_crime_15.py
# Split `train`'s columns into continuous and categorical feature-name lists
# (max_card=5 — presumably columns with more than 5 distinct values are
# treated as continuous; confirm against cont_cat_split's docs), excluding
# the dependent variable. The raw timestamp is removed from the categorical
# list since it is handled separately.
cont, cat = cont_cat_split(train, max_card=5, dep_var="TargetedCategory")
cat.remove("Dates")

# Report both lists, four column names per row.
for heading, names in (("Continuous columns:", cont), ("Categorical columns:", cat)):
    print(heading)
    for start in range(0, len(names), 4):
        print(' ' + ', '.join(names[start:start + 4]))
View dataset-report.html
<!doctype html><html lang=en><head><meta charset=utf-8><meta name=viewport content="width=device-width, initial-scale=1, shrink-to-fit=no"><meta name=description content="Profile report generated with the `pandas-profiling` Python package"><meta name=author content="Simon Brugman and the open source community."><meta name=generator content="Pandas Profiling v2.9.0"><meta name=url content=https://github.com/pandas-profiling/pandas-profiling><meta name=date content="2020-10-30 00:23:09.831154"><title>SF Crime Data Set Profile</title><style>
/*!
* Bootstrap v3.3.7 (http://getbootstrap.com)
* Copyright 2011-2016 Twitter, Inc.
* Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE)
*//*! normalize.css v3.0.3 | MIT License | github.com/necolas/normalize.css */html{font-family:sans-serif;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section,summary{display:block}audio,canvas,progress,video{dis
View sf_crime_17.py
# Compute ROC points from validation labels and predicted scores.
fpr, tpr, thresholds = roc_curve(y_validation, y_pred)

# One row, two panels: ROC on the left, precision/recall on the right.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("ROC Curve", "Precision vs Recall Curve"),
)
View sf_crime_16.py
# Confusion matrix at a 0.5 decision threshold, normalized per true row.
c = confusion_matrix(y_validation, y_pred > 0.5, normalize='true')

# Annotated heatmap; the same labels index both axes.
class_labels = ['Not Target', 'Target']
fig = ff.create_annotated_heatmap(c, x=class_labels, y=class_labels, colorscale="Greens")

# x-axis = model predictions (shown on top), y-axis = ground truth.
fig.update_xaxes(side="top", title="Prediction")
fig.update_yaxes(title="Truth")
fig.show()
View sf_crime_15.py
# Lay out a 1x2 figure: left panel for per-class score distributions,
# right panel for train-vs-validation score distributions.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=(
        "Classwise Score Distributions",
        "Train vs Validation Score Distributions"
    )
)
# class-wise score distributions
# NOTE(review): this create_distplot call is cut off at the excerpt
# boundary — its arguments continue beyond the visible snippet.
fig_distplots = ff.create_distplot(
View sf_crime_14.py
# LightGBM settings: gradient-boosted trees, binary objective, with
# is_unbalance letting the library re-weight the skewed classes.
params = {
    'boosting': 'gbdt',
    'objective': 'binary',
    'is_unbalance': True,
    'num_class': 1,
    'learning_rate': 0.1,
}

# Wrap the training arrays, flagging the district column as categorical,
# then boost for 250 rounds.
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=['PdDistrict'])
model = lgb.train(params, train_data, 250)

# In-sample scores (used downstream for diagnostics).
y_train_pred = model.predict(X_train)
View sf_crime_13.py
# Extract feature frames and flattened label vectors from the split object
# (`to` — presumably a fastai TabularPandas; verify upstream).
X_train = to.train.xs
y_train = to.train.ys.values.ravel()
X_validation = to.valid.xs
y_validation = to.valid.ys.values.ravel()

# Boolean mask selecting the positive class in the validation labels.
mask_positive_class = (y_validation == 1)

# bincount(...)[1] counts the 1-labels in each split.
print(f"The train set has {np.bincount(y_train)[1]} positive labels.")
print(f"The validation set has {np.bincount(y_validation)[1]} positive labels.")
View sf_crime_12.py
def time_split(df, validation_pct=0.2):
    """Chronologically split *df* on its "Dates" column.

    Returns a pair ``(train_index, validation_index)``: roughly the last
    ``validation_pct`` fraction of rows (by date) form the validation set.
    Rows sharing the boundary date fall into the training set.
    """
    ordered = df.sort_values("Dates")
    # Date value sitting at the train/validation boundary position.
    cutoff_pos = int(len(ordered) * (1 - validation_pct))
    cutoff_date = ordered.loc[ordered.index[cutoff_pos], "Dates"]
    train_index = ordered.index[ordered["Dates"] <= cutoff_date]
    validation_index = ordered.index[ordered["Dates"] > cutoff_date]
    return train_index, validation_index
# Hold out the most recent 20% of rows (by date) for validation.
train_idx, validation_idx = time_split(train, validation_pct=0.2)
# Report each split's size and the date span it covers.
print(f"Training data has {len(train_idx)} samples from {train.loc[train_idx, 'Dates'].min()} to {train.loc[train_idx, 'Dates'].max()}")
print(f"Validation data has {len(validation_idx)} samples from {train.loc[validation_idx, 'Dates'].min()} to {train.loc[validation_idx, 'Dates'].max()}")
View sf_crime_11.py
# Partition columns into continuous vs. categorical lists (max_card=5 —
# presumably the cardinality threshold for treating a column as
# categorical; confirm against cont_cat_split's docs), excluding the
# dependent variable. Drop the raw timestamp from the categorical list.
cont, cat = cont_cat_split(train, max_card=5, dep_var="TargetedCategory")
cat.remove("Dates")
print(f"{len(cont)} Continuous columns: {cont}")
print(f"{len(cat)} categorical columns: {cat}")
View sf_crime_10.py
# Expand the "Dates" timestamp into engineered date-part features
# (drop=False keeps the original column), then report what was added.
old_columns = train.columns
train = add_datepart(train, "Dates", drop=False)
# Columns present now but not before the call are the new features.
new_columns = list(set(train.columns) - set(old_columns))

# Fix: message previously read "add_datepart created N created new features".
print(f"add_datepart created {len(new_columns)} new features")
for i, new_column in enumerate(new_columns, start=1):
    print(f" {i}. {new_column}")