-
-
Save bahrunnur/8963237d721366baaab1 to your computer and use it in GitHub Desktop.
My solution for DMC 2015
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import time | |
import numpy as np | |
import pandas as pd | |
def sigmoid(x):
    """Logistic sigmoid: squashes any real input into the open interval (0, 1)."""
    z = np.exp(-x)
    return 1.0 / (1.0 + z)
def softmax(col):
    """Map a pandas column into the (0, 1) interval.

    The values are standardized (zero mean, unit variance) and then pushed
    through the logistic sigmoid.  Despite the name, this is a per-element
    squashing transform, not the classic softmax.
    """
    raw = col.values
    standardized = (raw - np.mean(raw)) / np.std(raw)
    return sigmoid(standardized)
def deal_with_numeric(df):
    """Deal with numeric fields.

    Normalizes values with log normalization rather than softmax: the price
    columns have a positively skewed distribution, so applying log penalizes
    large values and lowers the variance.
    Source: http://stats.stackexchange.com/q/298
    """
    # Aggregate price feature, computed on the *raw* prices before the
    # per-column log transform below.
    df['log_sum_price'] = np.log(df.price1 + df.price2 + df.price3 + 1)
    print(" --- Dealing with numeric. Log Transform.")
    for base in ('price', 'basePrice', 'reward'):
        for i in (1, 2, 3):
            name = '{}{}'.format(base, i)
            df[name] = np.log(df[name] + 1)
    print(" --- Dealing with numeric. Calculate diff price.")
    # Per-coupon feature: absolute price/basePrice gap.
    for i in (1, 2, 3):
        df['diffPrice{}_abs'.format(i)] = np.abs(
            df['price{}'.format(i)] - df['basePrice{}'.format(i)])
    # Signed gaps, shifted to be non-negative so they can be aggregated.
    for i in (1, 2, 3):
        delta = df['price{}'.format(i)] - df['basePrice{}'.format(i)]
        df['diffPrice{}'.format(i)] = delta + np.abs(delta.min())
    df['diffPrice'] = df['diffPrice1'] + df['diffPrice2'] + df['diffPrice3']
    # The raw columns and per-coupon shifted gaps are no longer needed.
    drop_cols = ['price1', 'basePrice1', 'diffPrice1',
                 'price2', 'basePrice2', 'diffPrice2',
                 'price3', 'basePrice3', 'diffPrice3']
    df = df.drop(drop_cols, axis=1)
    print(" --- Finished dealing with numeric.")
    return df
def prior_postprocess(df):
    """Turn the raw prior counts into rate features and clean basket stats.

    Each ``*_rate`` column is numerator / denominator; a zero denominator
    produces inf (x/0) or NaN (0/0), and both mean "no prior information",
    so they are replaced with 0.  The prior basketValue mean/std are
    NaN-filled and log-compressed like the other monetary features.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of the prior_analysis row loop (prior_* count columns).

    Returns
    -------
    pandas.DataFrame
        The same frame with the rate columns added and stats cleaned.
    """
    # (rate column, numerator, denominator) triples; previously this was
    # twelve near-identical statements.
    rate_specs = [
        ('prior_using_coupon_rate', 'prior_using_coupon', 'prior_transactions'),
        ('prior_couponID1_used_rate', 'prior_couponID1_used', 'prior_couponID1_population'),
        ('prior_couponID2_used_rate', 'prior_couponID2_used', 'prior_couponID2_population'),
        ('prior_couponID3_used_rate', 'prior_couponID3_used', 'prior_couponID3_population'),
    ]
    for rate_col, num_col, denom_col in rate_specs:
        rate = df[num_col] / df[denom_col]
        # np.nan instead of the np.NaN alias, which was removed in NumPy 2.0.
        df[rate_col] = rate.replace([np.inf, -np.inf], 0).fillna(0)
    for col in ('prior_mean_basketValue', 'prior_std_basketValue'):
        df[col] = np.log(df[col].fillna(0) + 1)
    return df
def prior_analysis(df, silent=True):
    """Prior Analysis:
    2. User Prior Transactions
    3. Coupon ID prior used
    4. Prior mean basketValue based on user

    For each row, only transactions with a strictly earlier orderTime count
    as "prior", so no future information leaks into the features.

    NOTE(review): this filters the whole frame once per row (O(n^2) with
    iterrows); acceptable for competition-sized data, slow otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain orderTime, userID, couponID1-3, coupon1Used-coupon3Used
        and basketValue columns.
    silent : bool
        When False, print progress after every processed row.

    Returns
    -------
    pandas.DataFrame
        The same frame with prior_* columns added, post-processed by
        prior_postprocess.
    """
    # Helper column: total coupons used per transaction (dropped later by
    # deal_with_time).
    df['totalCouponUsed'] = df['coupon1Used'] + df['coupon2Used'] + df['coupon3Used']
    for idx, row in df.iterrows():
        # Everything that happened strictly before this order (any user).
        prior = df[(df.orderTime < row.orderTime)]
        # 2. This user's prior transactions, and how many used any coupon.
        pt = prior[(prior.userID == row.userID)]
        pcu = pt[pt.totalCouponUsed > 0]
        df.loc[idx, 'prior_transactions'] = pt.shape[0]
        df.loc[idx, 'prior_using_coupon'] = pcu.shape[0]
        # 3. Prior occurrences of this row's coupon ids in any of the three
        # coupon slots (pcidA_B: row's coupon A matched against prior slot B).
        pcid1_1 = prior[(prior.couponID1 == row.couponID1)]
        pcid1_2 = prior[(prior.couponID2 == row.couponID1)]
        pcid1_3 = prior[(prior.couponID3 == row.couponID1)]
        pcid2_1 = prior[(prior.couponID1 == row.couponID2)]
        pcid2_2 = prior[(prior.couponID2 == row.couponID2)]
        pcid2_3 = prior[(prior.couponID3 == row.couponID2)]
        pcid3_1 = prior[(prior.couponID1 == row.couponID3)]
        pcid3_2 = prior[(prior.couponID2 == row.couponID3)]
        pcid3_3 = prior[(prior.couponID3 == row.couponID3)]
        pcid1 = pcid1_1.shape[0] + pcid1_2.shape[0] + pcid1_3.shape[0]
        pcid2 = pcid2_1.shape[0] + pcid2_2.shape[0] + pcid2_3.shape[0]
        pcid3 = pcid3_1.shape[0] + pcid3_2.shape[0] + pcid3_3.shape[0]
        df.loc[idx, 'prior_couponID1_population'] = pcid1
        df.loc[idx, 'prior_couponID2_population'] = pcid2
        df.loc[idx, 'prior_couponID3_population'] = pcid3
        # ...and in how many of those occurrences the coupon was actually
        # used (the usage flag is matched to the slot it appeared in).
        pcid1_1_used = pcid1_1[pcid1_1.coupon1Used > 0]
        pcid1_2_used = pcid1_2[pcid1_2.coupon2Used > 0]
        pcid1_3_used = pcid1_3[pcid1_3.coupon3Used > 0]
        pcid2_1_used = pcid2_1[pcid2_1.coupon1Used > 0]
        pcid2_2_used = pcid2_2[pcid2_2.coupon2Used > 0]
        pcid2_3_used = pcid2_3[pcid2_3.coupon3Used > 0]
        pcid3_1_used = pcid3_1[pcid3_1.coupon1Used > 0]
        pcid3_2_used = pcid3_2[pcid3_2.coupon2Used > 0]
        pcid3_3_used = pcid3_3[pcid3_3.coupon3Used > 0]
        pcid1_used = pcid1_1_used.shape[0] + pcid1_2_used.shape[0] + pcid1_3_used.shape[0]
        pcid2_used = pcid2_1_used.shape[0] + pcid2_2_used.shape[0] + pcid2_3_used.shape[0]
        pcid3_used = pcid3_1_used.shape[0] + pcid3_2_used.shape[0] + pcid3_3_used.shape[0]
        df.loc[idx, 'prior_couponID1_used'] = pcid1_used
        df.loc[idx, 'prior_couponID2_used'] = pcid2_used
        df.loc[idx, 'prior_couponID3_used'] = pcid3_used
        # 4. Basket value statistics over this user's prior transactions
        # (NaN when the user has no prior rows; cleaned in prior_postprocess).
        pmbv = pt['basketValue'].mean()
        pstdmbv = pt['basketValue'].std()
        df.loc[idx, 'prior_mean_basketValue'] = pmbv
        df.loc[idx, 'prior_std_basketValue'] = pstdmbv
        if not silent:
            print(" --- Finished processing prior at row: #{}".format(idx))
    df = prior_postprocess(df)
    return df
def deal_with_time(df):
    """Dealing with Time features.
    Time series analysis:
    1. Create delta time column. Indicating for how long the user has had the
       coupons. (in seconds)
    6. What day is today? (Monday, Sunday, etc)
    """
    print(" --- Dealing with Times.")
    # 1. Delta time: how long the user held the coupons before ordering,
    # in seconds (then log-compressed).
    date_format = "%Y-%m-%d %H:%M:%S"
    # NOTE(review): time.mktime interprets the timestamps in the machine's
    # local timezone -- confirm the data is meant to be local, not UTC.
    order_time = df['orderTime'].apply(lambda x: time.mktime(time.strptime(x, date_format)))
    coupons_received = df['couponsReceived'].apply(lambda x: time.mktime(time.strptime(x, date_format)))
    df['coupon_delta'] = order_time - coupons_received
    df['coupon_delta'] = np.log(df['coupon_delta'] + 1)
    # Not sure why, but this has a decent correlation with basketValue.
    # (depends on the log_sum_price column created by deal_with_numeric)
    df['pressure'] = df['log_sum_price'] * df['coupon_delta']
    # Convert to pandas datetime for the dayofweek accessor below.
    df['orderTime'] = pd.to_datetime(df['orderTime'])
    df['couponsReceived'] = pd.to_datetime(df['couponsReceived'])
    # Prior Analysis (adds the prior_* columns and totalCouponUsed helper).
    df = prior_analysis(df)
    # 6. day of week, one-hot encoded
    # df['dayofweek'] = df['orderTime'].dt.dayofweek
    dayofweek = pd.get_dummies(df['orderTime'].dt.dayofweek, prefix='dayofweek')
    df = pd.concat([df, dayofweek], axis=1)
    # Drop the raw timestamp columns and the helper added by prior_analysis.
    df = df.drop(['orderTime', 'couponsReceived', 'totalCouponUsed'], axis=1)
    print(" --- Finished dealing with Times.")
    return df
def deal_with_brands(df):
    """Binarize the three brand columns into dummy indicator variables."""
    brand_cols = ['brand1', 'brand2', 'brand3']
    print(" --- Dealing with Brands. Binarization.")
    for brand in brand_cols:
        indicators = pd.get_dummies(df[brand], prefix=brand)
        df = pd.concat([df, indicators], axis=1)
    df = df.drop(brand_cols, axis=1)
    print(" --- Finished dealing with Brands.")
    return df
def deal_with_coupon_id(df):
    """Drop the raw coupon id columns.

    (An alternative, noted for future work, is replacing each id with its
    population count or its prior usage rate.)
    """
    print(" --- Dealing with coupon id. Drop it.")
    df = df.drop(['couponID1', 'couponID2', 'couponID3'], axis=1)
    print(" --- Finished dealing with coupon id.")
    return df
def deal_with_user_id(df):
    """One-hot encode userID and drop the raw column."""
    print(" --- Dealing with Users. Binarization.")
    user_dummies = pd.get_dummies(df['userID'], prefix='userID')
    df = pd.concat([df, user_dummies], axis=1).drop('userID', axis=1)
    print(" --- Finished dealing with Users.")
    return df
def deal_with_product_group(df):
    """Drop the product group columns (not used as features)."""
    print(" --- Dealing with Product Group. Dropping.")
    group_cols = ['productGroup1', 'productGroup2', 'productGroup3']
    df = df.drop(group_cols, axis=1)
    print(" --- Finished dealing with Product Group.")
    return df
def deal_with_category(df):
    """Apply all id/categorical column transforms in sequence.

    Coupon ids and product groups are dropped; userID is one-hot encoded.
    """
    for transform in (deal_with_coupon_id, deal_with_user_id, deal_with_product_group):
        df = transform(df)
    print(" --- Finished dealing with category.")
    return df
def init(df): | |
"""Base trainsformation of features.""" | |
x = df.categoryIDs1.str.get_dummies(sep=',') | |
y = df.categoryIDs2.str.get_dummies(sep=',') | |
z = df.categoryIDs3.str.get_dummies(sep=',') | |
xcols = 'categoryIDs1_' + x.columns.values | |
ycols = 'categoryIDs2_' + y.columns.values | |
zcols = 'categoryIDs3_' + z.columns.values | |
x.columns = xcols | |
y.columns = ycols | |
z.columns = zcols | |
indc = pd.concat([x, y, z], axis=1) | |
procs = df.drop(['categoryIDs1', 'categoryIDs2', 'categoryIDs3'], axis=1) | |
procs = pd.concat([procs, indc], axis=1) | |
pcols = procs.columns.tolist() | |
pcols = pcols[:11] + xcols.tolist() + pcols[11:18] + ycols.tolist() + pcols[18:25] + zcols.tolist() + pcols[25:29] | |
procs = procs[pcols] | |
return procs | |
def main():
    """Run the full preprocessing pipeline and write the train/test CSVs."""
    trains = pd.read_csv('../data/processed_train.csv')
    tests = pd.read_csv('../data/processed_test.csv')
    # Remember where train ends so the combined frame can be split back.
    split = trains.shape[0]
    df = pd.concat([trains, tests], axis=0, ignore_index=True)
    # Feature engineering pipeline.
    for step in (deal_with_numeric, deal_with_time, deal_with_brands, deal_with_category):
        df = step(df)
    # Move the target columns to the end.
    target_cols = ['coupon1Used', 'coupon2Used', 'coupon3Used', 'basketValue']
    targets = df[target_cols]
    df = pd.concat([df.drop(target_cols, axis=1), targets], axis=1)
    df.iloc[:split].to_csv('../data/train_gue.csv', index=False)
    df.iloc[split:].to_csv('../data/test_gue.csv', index=False)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Script of my solution to DMC 2015 | |
Use this script in the following way: | |
python solution.py <name-of-submission> | |
Argument is optional, the script will assign default name. | |
""" | |
from __future__ import division | |
import sys | |
import pdb | |
import numpy as np | |
import pandas as pd | |
from scipy import stats | |
from sklearn.externals import joblib | |
from sklearn.decomposition import TruncatedSVD | |
from sklearn.linear_model import LogisticRegression, ElasticNet, Ridge, LinearRegression | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.feature_extraction.text import TfidfTransformer | |
from sklearn import multiclass | |
from sklearn import cross_validation | |
from sklearn import grid_search | |
from scipy.stats.stats import pearsonr | |
from XGBoostClassifier import XGBoostClassifier | |
np.random.seed(17411) | |
def dmc_loss(y_true, y_pred, eps=1e-15):
    """DMC 2015 competition loss.

    Each column's error is divided by that column's mean over ``y_true``
    before squaring, so the coupon indicators and basketValue contribute on
    a comparable scale; the scaled squared errors are then summed.

    Parameters
    ----------
    y_true : array, shape = [n_samples, 4]
    y_pred : array, shape = [n_samples, 4]
        Columns: coupon1, coupon2, coupon3, basketValue.
    eps : float
        Currently unused; kept for interface compatibility (the clipping
        step that used it is disabled).

    Returns
    -------
    loss : float
        Sum of squared column-normalized errors (0.0 for a perfect match).
    """
    column_scale = np.mean(y_true, axis=0)
    scaled_error = np.abs(y_true - y_pred) / column_scale
    return np.sum(np.square(scaled_error))
def time_series_validation(n, step):
    """Generate expanding-window train/test index splits for time-series CV.

    Starts with a 2200-row training window followed by a 650-row test
    window, and grows the training window by 100 rows per split.

    Yields
    ------
    (train_indices, test_indices) : tuple of ndarrays, ``step`` times.
    """
    train_end, horizon, leap = 2200, 650, 100
    indices = np.arange(n)
    for _ in range(step):
        yield indices[:train_end], indices[train_end:train_end + horizon]
        train_end += leap
# Columns fed to the basketValue regression model (selected in
# make_features / used via load_train_data); the per-coupon price, reward
# and rate columns are not part of this list.
BASKET_VALUE_FEATURES = [
    'pressure', 'prior_mean_basketValue', 'prior_std_basketValue',
    'prior_using_coupon_rate',
    'dayofweek_0', 'dayofweek_1', 'dayofweek_2',
    'dayofweek_3', 'dayofweek_4', 'dayofweek_5', 'dayofweek_6',
    'premiumProduct1', 'premiumProduct2', 'premiumProduct3',
]
def coupons_col_filter(df, coupons=1):
    """Return the column names to DROP when building one coupon's features.

    Features shared among the coupon classifiers (e.g. diffPrice and
    log_sum_price, which exist to compare the coupons) are kept; the
    per-coupon columns of the OTHER two coupons plus some general prior
    columns are filtered out.

    Parameters
    ----------
    df : pandas.DataFrame
        Only its column names are inspected.
    coupons : int
        Which coupon (1, 2 or 3) the features are being built for.

    Raises
    ------
    KeyError
        If ``coupons`` is not 1, 2 or 3.
    """
    shared_drop = [
        'prior_mean_basketValue', 'prior_std_basketValue',
        'prior_using_coupon_rate', 'pressure',
        'prior_couponID1_population', 'prior_couponID2_population', 'prior_couponID3_population',
    ]
    cols = df.columns.tolist()
    per_coupon = {}
    for i in (1, 2, 3):
        block = ['premiumProduct%d' % i, 'diffPrice%d_abs' % i, 'reward%d' % i,
                 'prior_couponID%d_used' % i, 'prior_couponID%d_used_rate' % i]
        block += [c for c in cols if c.startswith('categoryIDs%d' % i)]
        block += [c for c in cols if c.startswith('brand%d' % i)]
        per_coupon[i] = block
    if coupons not in per_coupon:
        raise KeyError('bego lu!')
    others = [i for i in (1, 2, 3) if i != coupons]
    return shared_drop + per_coupon[others[0]] + per_coupon[others[1]]
def make_coupon_features(df, coupons=1):
    """Build the feature matrix for one coupon classifier.

    Drops the columns belonging to the other two coupons (plus some general
    prior columns) via coupons_col_filter, then slices away the first
    column and the trailing four columns.

    NOTE(review): the [:, 1:-4] slice assumes the first column is an id
    (presumably orderID) and the last four are the targets written last by
    the preprocessing script (coupon1Used..coupon3Used, basketValue) --
    verify if the CSV layout ever changes.
    """
    filters = coupons_col_filter(df, coupons)
    coupons_features = df.drop(filters, axis=1)
    X_coupons = coupons_features.values.copy()
    X_coupons = X_coupons[:, 1:-4]
    return X_coupons
def make_features(df):
    """Split the frame into the four model input matrices.

    Returns
    -------
    tuple of ndarrays
        (X_coupons1, X_coupons2, X_coupons3, X_basket): one feature matrix
        per coupon classifier, plus the regression features listed in
        BASKET_VALUE_FEATURES for the basketValue model.
    """
    # X for classification (coupon features)
    X_coupons1 = make_coupon_features(df, 1)
    X_coupons2 = make_coupon_features(df, 2)
    X_coupons3 = make_coupon_features(df, 3)
    # X for regression (basketValue features)
    basket_features = df[BASKET_VALUE_FEATURES]
    X_basket = basket_features.values.copy()
    return X_coupons1, X_coupons2, X_coupons3, X_basket
def load_train_data(path=None):
    """Load the processed training CSV and split it into X/y arrays.

    Parameters
    ----------
    path : str, optional
        CSV file to read; defaults to '../data/train_gue.csv'.
        (Bug fix: this argument used to be accepted but silently ignored.)

    Returns
    -------
    tuple
        (X_coupons1, X_coupons2, X_coupons3, X_basket, y_coupons, y_basket)
        as float arrays, with y_coupons cast to int.
    """
    df = pd.read_csv(path if path is not None else '../data/train_gue.csv')
    # y for classification: one binary column per coupon.
    coupon_used = df[['coupon1Used', 'coupon2Used', 'coupon3Used']]
    y_coupons = coupon_used.values.copy()
    # y for regression (basketValue).
    basket_value = df['basketValue']
    y_basket = basket_value.values.copy()
    X_coupons1, X_coupons2, X_coupons3, X_basket = make_features(df)
    print(" -- Data loaded.")
    return (X_coupons1.astype(float), X_coupons2.astype(float),
            X_coupons3.astype(float), X_basket.astype(float),
            y_coupons.astype(int), y_basket.astype(float))
def load_test_data(path=None):
    """Load the processed test CSV and build the feature matrices.

    Parameters
    ----------
    path : str, optional
        CSV file to read; defaults to '../data/test_gue.csv'.
        (Bug fix: this argument used to be accepted but silently ignored.)

    Returns
    -------
    tuple of float ndarrays
        (X_coupons1, X_coupons2, X_coupons3, X_basket).
    """
    df = pd.read_csv(path if path is not None else '../data/test_gue.csv')
    X_coupons1, X_coupons2, X_coupons3, X_basket = make_features(df)
    return (X_coupons1.astype(float), X_coupons2.astype(float),
            X_coupons3.astype(float), X_basket.astype(float))
def create_coupons_model():
    """Return a fresh classifier for the coupon-usage targets.

    An L1-penalized logistic regression; predict_proba supplies the coupon
    usage probability used downstream.
    """
    return LogisticRegression(penalty='l1')
def create_regression_model():
    """Return a fresh regressor for (log-transformed) basketValue.

    Plain OLS; ElasticNet(alpha=0.1, l1_ratio=0.01) and Ridge(alpha=0.1)
    were earlier, now-disabled alternatives.
    """
    return LinearRegression()
def dump_ensemble(i, ids, y_preds):
    """Persist one validation split's predictions for later ensembling.

    Parameters
    ----------
    i : int
        Split number, used in the output file name.
    ids : array-like
        orderID values for the predicted rows (written as the index).
    y_preds : ndarray, shape [n, 4]
        Columns: coupon1Used, coupon2Used, coupon3Used, basketValue.
    """
    target_names = ['coupon1Used', 'coupon2Used', 'coupon3Used', 'basketValue']
    out = pd.DataFrame(y_preds, index=ids, columns=target_names)
    out.to_csv("../ensemble/bahrun/bahrun_{}.csv".format(i), index_label='orderID')
def validation():
    """Local Cross Validation.

    Runs the expanding-window time-series splits: per split, fits the three
    coupon classifiers and the basketValue regressor on the training window,
    scores the test window with dmc_loss, dumps per-split predictions for
    ensembling, and finally prints summary statistics of the scores.
    """
    X_coupons1, X_coupons2, X_coupons3, X_basket, y_coupons, y_basket = load_train_data()
    # A single classifier instance is re-fit for each coupon in turn.
    klassif = create_coupons_model()
    regress = create_regression_model()
    print(" --- Start local evaluation.")
    # kf = cross_validation.KFold(X_coupons.shape[0], n_folds=10)
    # ss = cross_validation.ShuffleSplit(X_coupons.shape[0], n_iter=100, train_size=0.9)
    tsv = time_series_validation(X_basket.shape[0], 33)
    i=1; scores = []
    for train, test in tsv:
        size = test.shape[0]
        # 1. coupon 1 classifier: probability of the positive class.
        klassif.fit(X_coupons1[train], y_coupons[train, 0])
        y_coupons1_preds = klassif.predict_proba(X_coupons1[test])[:,1]
        y_coupons1_preds = y_coupons1_preds.reshape(size, 1)
        # 2. coupon 2 classifier.
        klassif.fit(X_coupons2[train], y_coupons[train, 1])
        y_coupons2_preds = klassif.predict_proba(X_coupons2[test])[:,1]
        y_coupons2_preds = y_coupons2_preds.reshape(size, 1)
        # 3. coupon 3 classifier.
        klassif.fit(X_coupons3[train], y_coupons[train, 2])
        y_coupons3_preds = klassif.predict_proba(X_coupons3[test])[:,1]
        y_coupons3_preds = y_coupons3_preds.reshape(size, 1)
        y_coupons_preds = np.hstack((y_coupons1_preds, y_coupons2_preds, y_coupons3_preds))
        # basketValue is modeled in log space and inverted with exp.
        regress.fit(X_basket[train], np.log(y_basket[train]))
        y_basket_preds = np.exp(regress.predict(X_basket[test]))
        y_basket_preds = y_basket_preds.reshape(size, 1)
        y_preds = np.hstack((y_coupons_preds, y_basket_preds))
        y_coupons_true = y_coupons[test]
        y_basket_true = y_basket[test].reshape(X_basket[test].shape[0], 1)
        y_true = np.hstack((y_coupons_true, y_basket_true))
        score = dmc_loss(y_true, y_preds)
        # Row ids are 1-based positions in the training file.
        ids = test + 1
        dump_ensemble(i, ids, y_preds)
        print(" ---- Score of #{0} : {1:.5f}".format(i, score))
        i += 1
        scores.append(score)
    print(" --- Finished local evaluation.")
    quantiles = stats.mstats.mquantiles(scores)
    print(" --- Score Results:")
    print(" - min: {:.5f}".format(np.min(scores)))
    print(" - 25%: {:.5f}".format(quantiles[0]))
    print(" - median: {:.5f}".format(np.median(scores)))
    print(" - 75%: {:.5f}".format(quantiles[2]))
    print(" - max: {:.5f}".format(np.max(scores)))
    print(" - mean: {0:.5f} (+/-{1:.5f})".format(np.mean(scores), stats.sem(scores)))
def train():
    """Fit the three coupon classifiers and the basketValue regressor on all data.

    Returns
    -------
    tuple
        (klassif1, klassif2, klassif3, regress) -- one fitted classifier
        per coupon, plus the fitted regressor.
    """
    (X_coupons1, X_coupons2, X_coupons3,
     X_basket, y_coupons, y_basket) = load_train_data()
    classifiers = [create_coupons_model() for _ in range(3)]
    regress = create_regression_model()
    print(" -- Start training.")
    for idx, (clf, X) in enumerate(zip(classifiers, (X_coupons1, X_coupons2, X_coupons3))):
        clf.fit(X, y_coupons[:, idx])
    # basketValue is trained in log space; make_submission inverts with exp.
    regress.fit(X_basket, np.log(y_basket))
    print(" -- Finished training.")
    klassif1, klassif2, klassif3 = classifiers
    return klassif1, klassif2, klassif3, regress
def make_submission(classifiers, path='../bahrun_submission.csv'):
    """Predict on the test set and write the submission CSV.

    The output path may be overridden by the first command-line argument.

    Parameters
    ----------
    classifiers : tuple
        (klassif1, klassif2, klassif3, regress) as returned by train().
    path : str
        Default output file used when no CLI argument is given.
    """
    path = sys.argv[1] if len(sys.argv) > 1 else path
    klassif1, klassif2, klassif3, regress = classifiers
    X_coupons1, X_coupons2, X_coupons3, X_basket = load_test_data()
    size = X_basket.shape[0]
    # coupon prediction (classification): probability of the positive class
    # 1.
    y_coupons1_preds = klassif1.predict_proba(X_coupons1)[:,1]
    y_coupons1_preds = y_coupons1_preds.reshape(size, 1)
    # 2.
    y_coupons2_preds = klassif2.predict_proba(X_coupons2)[:,1]
    y_coupons2_preds = y_coupons2_preds.reshape(size, 1)
    # 3.
    y_coupons3_preds = klassif3.predict_proba(X_coupons3)[:,1]
    y_coupons3_preds = y_coupons3_preds.reshape(size, 1)
    y_coupons_preds = np.hstack((y_coupons1_preds, y_coupons2_preds, y_coupons3_preds))
    # basket prediction (regression): the model was fit on log(basketValue),
    # so invert with exp.
    y_basket_preds = np.exp(regress.predict(X_basket))
    y_basket_preds = y_basket_preds.reshape(size, 1)
    y_preds = np.hstack((y_coupons_preds, y_basket_preds))
    # Take ids and column names from the official submission format file.
    subfmt = pd.read_csv('../data/submission_format.csv')
    preds = pd.DataFrame(y_preds, index=subfmt.orderID.values,
                         columns=subfmt.columns[1:])
    # preds.to_csv(path, index_label='orderID', sep="|")
    preds.to_csv(path, index_label='orderID')
    print(" -- Wrote submission to file {}.".format(path))
def main():
    """Entry point: train on all data and write the submission file."""
    print(" - Start.")
    # validation()  # enable to run local cross validation instead
    make_submission(train())
    print(" - Finished.")


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment