Skip to content

Instantly share code, notes, and snippets.

@WittmannF
Created January 11, 2020 18:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save WittmannF/7ac1af80ef586c2a6cb0aaca5e506824 to your computer and use it in GitHub Desktop.
Save WittmannF/7ac1af80ef586c2a6cb0aaca5e506824 to your computer and use it in GitHub Desktop.
## Imports
from ashrae_utils import reduce_mem_usage, CyclicLR, LRFinder
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import tqdm
import gc
from sklearn.linear_model import RidgeCV
import seaborn as sns
## Parameters
# Module-level configuration read by the functions below.
BENCHMARK_SUBS = True  # Print each submission's error against the leak data before fitting
PRINT_CORR_HEATMAP = False  # Save a correlation heatmap of submissions vs. leak target
REPLACE_LEAK = True  # Replace leak data or not
DEBUG = False  # When True, export() does not write submission.csv
PRINT_WEIGHTS = True  # Print the fitted ridge coefficient of every submission
# Blending strategy. Only 'ridgecv' is handled by predict() in this file; the
# other modes named by the original author (Mean, Median, keras,
# errorinversionnormalized) are not implemented here.
TYPE_PREDICTION = 'ridgecv'
# Candidate regularization strengths passed to RidgeCV by ridgecv_predict().
# This name was referenced but never defined; the value below is RidgeCV's own
# documented default grid. TODO(review): tune if a wider search is wanted.
RIDGE_ALPHAS = (0.1, 1.0, 10.0)
# One CSV per model to blend; column i of the blended frame comes from entry i.
# NOTE(review): the trailing numbers look like leaderboard scores - unverified.
submission_paths = [
    '/kaggle/input/half-half-drop-rows-stratify-weekday/submission.csv',  # 1.105 --> 0.9446
    '/kaggle/input/simple-data-cleanup-3-models/submission.csv',  # 1.072
    '/kaggle/input/ashrae-kfold-lightgbm-without-leak-1-08/submission.csv',
    '/kaggle/input/another-1-08-lb-no-leak/fe2_lgbm.csv',
    '/kaggle/input/ashrae-kfold-lightgbm-without-building-id/submission.csv',  # 1.098
    '/kaggle/input/ashrae-energy-prediction-using-stratified-kfold/fe2_lgbm.csv',  # 1.074
    '/kaggle/input/ashrae-lightgbm-without-leak/submission.csv',  # 1.082
    '/kaggle/input/ashrae-stratified-kfold-lightgbm/submission.csv',  # 1.075
    '/kaggle/input/ashrae-2-lightgbm-without-leak-data/submission.csv',
]
## Functions
def rmse(y_true, y_pred):
    """Return the root-mean-square error between `y_true` and `y_pred`.

    Both arguments must support element-wise subtraction with each other
    (scalars, NumPy arrays, pandas Series).
    """
    residual = y_true - y_pred
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)
def rmsle(y_true, y_pred):
    """Return the root-mean-square error between log1p(`y_true`) and log1p(`y_pred`).

    Inputs are transformed with `np.log1p` before the difference is taken, so
    values below -1 yield NaN.
    """
    log_gap = np.log1p(y_true) - np.log1p(y_pred)
    mean_squared = np.mean(log_gap ** 2)
    return np.sqrt(mean_squared)
def read_submissions():
    """Load every CSV listed in `submission_paths`.

    Returns:
        A pair `(sub, subs)`:
        - `sub`: the last submission read, with columns
          `['row_id', 'meter_reading']`, used as the output template.
        - `subs`: one `meter_reading_<i>` column per submission (in
          `submission_paths` order) followed by a `row_id` column taken from
          the last submission read.

    Both frames are passed through `reduce_mem_usage` before being returned.
    """
    print('## Reading submissions')
    prediction_columns = []
    for idx, csv_path in enumerate(submission_paths):
        print(f'Reading {csv_path}')
        column_name = f'meter_reading_{idx}'
        sub = pd.read_csv(csv_path)
        sub.columns = ['row_id', column_name]
        prediction_columns.append(sub[column_name])

    # Columns are joined on the default index, not on row_id.
    # NOTE(review): this assumes every file lists rows in the same order - confirm.
    subs = pd.concat(prediction_columns, axis=1)
    subs['row_id'] = sub.row_id

    subs = reduce_mem_usage(subs)
    sub = reduce_mem_usage(sub)
    sub.columns = ['row_id', 'meter_reading']
    return sub, subs
def read_leak():
    """Load the leaked test-set readings, with negative readings raised to 0.

    Returns:
        A DataFrame with a single `meter_reading` column, indexed by the first
        column of the CSV.
    """
    leak = pd.read_csv(
        '/kaggle/input/leak-test-set/y_test.csv',
        names=['meter_reading'],
        index_col=0,
    )
    # Only a lower bound is applied; large readings are left untouched.
    leak['meter_reading'] = np.clip(leak['meter_reading'], 0, None)
    return leak
def leak_benchmark(sub):
    """Score `sub` against the leaked readings and optionally overwrite them.

    Prints the RMSLE of `sub['meter_reading']` on the rows present in the leak
    data. When the module-level `REPLACE_LEAK` flag is true, those rows are
    then overwritten in place with the leaked values.

    Args:
        sub: DataFrame with a `meter_reading` column whose index labels
            include every index label of the leak data.

    Returns:
        The same `sub` object (modified in place when `REPLACE_LEAK` is true).
    """
    print("## Comparing predictions against leak data")
    y_test = read_leak()
    leak_rows = y_test.index

    rmsle_error = rmsle(
        y_test['meter_reading'].values,
        sub.loc[leak_rows, 'meter_reading'].values,
    )
    print(f'RMSLE in the leak data is {rmsle_error}')

    if REPLACE_LEAK:
        print("## Replacing predictions with leak data")
        # A single .loc write: the previous chained form
        # sub['meter_reading'][rows] = ... assigns into a temporary and is a
        # silent no-op under pandas copy-on-write.
        sub.loc[leak_rows, 'meter_reading'] = y_test['meter_reading']
    return sub
def read_X_test():
    """Load the test-set features from the feather file, indexed by `row_id`."""
    feather_path = '/kaggle/input/ashrae-feather-format-for-fast-loading/test.feather'
    return pd.read_feather(feather_path).set_index('row_id')
def prepare_X(X):
    """Return `log1p(X)`, the element-wise transform applied to model inputs."""
    return np.log1p(X)
def benchmark_subs(X, y):
    """Print the error of every column of `X` against `y['meter_reading']`.

    Column `i` of `X` is labelled with the parent directory name of
    `submission_paths[i]`. The score is a plain RMSE; it is reported as RMSLE
    because the caller passes values that are already log1p-transformed.

    Args:
        X: DataFrame with one column per submission.
        y: DataFrame with a `meter_reading` column of the same length.
    """
    target = y['meter_reading'].values
    for position, column in enumerate(X.columns):
        source_dir = submission_paths[position].split('/')[-2]
        print(f"Benchmarking {source_dir}")
        score = rmse(X[column].values, target)
        print(f"RMSLE is {score}")
def ridgecv_predict(subs):
    """Blend the submissions with a ridge regression fitted on the leaked rows.

    The regression is fitted in log1p space: inputs are the submissions'
    predictions on the leaked rows, the target is the leaked readings. The
    fitted model is then applied to every row of `subs`.

    Reads the module-level `submission_paths`, `RIDGE_ALPHAS`,
    `BENCHMARK_SUBS`, `PRINT_CORR_HEATMAP` and `PRINT_WEIGHTS`; all of them
    must be defined at module scope before this function is called.

    Args:
        subs: DataFrame whose first `len(submission_paths)` columns are the
            individual submissions' predictions.

    Returns:
        1-D array of blended predictions on the original (non-log) scale, one
        per row of `subs`, never negative.
    """
    n_models = len(submission_paths)

    y = read_leak()
    # Positional selection using the leak index values.
    # NOTE(review): assumes row i of `subs` holds row_id i - confirm.
    X = prepare_X(subs.iloc[y.index, :n_models])
    y = np.log1p(y)

    if BENCHMARK_SUBS:
        benchmark_subs(X, y)

    if PRINT_CORR_HEATMAP:
        sns_plot = sns.heatmap(pd.concat([X, y], axis=1).corr(), annot=True)
        # heatmap() returns a matplotlib Axes, which has no savefig();
        # save through the Figure that owns it.
        sns_plot.get_figure().savefig("corr_w_gt.png")

    # y is a one-column DataFrame, so coef_ is 2-D and predict() returns a
    # column vector; both are unwrapped with [0] / .T[0] below.
    reg = RidgeCV(alphas=RIDGE_ALPHAS).fit(X, y)

    if PRINT_WEIGHTS:
        weights = reg.coef_[0]
        print("## Ridge Coefficients")
        print(f'Sum of coefficients: {sum(weights)}')
        for ww, ss in zip(weights, submission_paths):
            print(f'{ss.split("/")[-2]} has weight {ww:.2f}')

    X = prepare_X(subs.iloc[:, :n_models])
    y_pred = reg.predict(X).T[0]
    # Clip in log space so that expm1 cannot produce a negative reading.
    y_pred = np.clip(y_pred, 0, None)
    return np.expm1(y_pred)
def predict(subs, **kwargs):
    """Blend `subs` using the strategy selected by module-level `TYPE_PREDICTION`.

    Args:
        subs: DataFrame of individual submissions, forwarded to the blender.
        **kwargs: Accepted for call compatibility; currently unused.

    Returns:
        The blended predictions produced by the selected strategy.

    Raises:
        NotImplementedError: If `TYPE_PREDICTION` names a strategy this file
            does not implement. Previously this case silently returned None.
    """
    if TYPE_PREDICTION == 'ridgecv':
        return ridgecv_predict(subs)
    raise NotImplementedError(
        f"TYPE_PREDICTION={TYPE_PREDICTION!r} is not implemented; "
        "only 'ridgecv' is supported"
    )
def export(sub):
    """Write `sub` to `submission.csv` unless the module-level `DEBUG` flag is set.

    The file is written without the index, with floats formatted as `%g`.
    """
    if DEBUG:
        return
    print('## Saving to CSV')
    sub.to_csv('submission.csv', index=False, float_format='%g')
## Main Function
if __name__ == '__main__':
    # Pipeline: load -> blend -> leak correction -> write.

    # 1. Load the output template (`sub`) and the per-model predictions (`subs`).
    sub, subs = read_submissions()

    # 2. Replace the template's readings with the blended predictions.
    sub['meter_reading'] = predict(subs)

    # 3. Score against the leak data (and overwrite those rows if enabled).
    sub = leak_benchmark(sub)

    # 4. Write the final submission file.
    export(sub)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment