My solution for DMC 2015
from __future__ import division
import time
import numpy as np
import pandas as pd

def sigmoid(x):
    """Sigmoid function."""
    return 1.0 / (1.0 + np.exp(-x))


def softmax(col):
    """Normalize a column into [0, 1]: standardize, then apply the sigmoid.

    This can be used to squash numerical features into the [0, 1] interval.
    """
    val = col.values
    val = (val - np.mean(val)) / np.std(val)
    return sigmoid(val)

def deal_with_numeric(df):
    """Deal with the numeric fields.

    Normalize values with a log transform.

    Why a log transform?
    ====================
    The log transform performs better than softmax here because these
    numerical columns have positively skewed distributions. Applying the
    log penalizes large values, which lowers the variance.
    Source: http://stats.stackexchange.com/q/298
    """
    numeric = ['price1', 'basePrice1', 'reward1',
               'price2', 'basePrice2', 'reward2',
               'price3', 'basePrice3', 'reward3']
    df['log_sum_price'] = np.log(df.price1 + df.price2 + df.price3 + 1)
    print(" --- Dealing with numeric. Log Transform.")
    for col in numeric:
        df[col] = np.log(df[col] + 1)
    print(" --- Dealing with numeric. Calculate diff price.")
    # absolute price differences, used as per-coupon features
    df['diffPrice1_abs'] = np.abs(df['price1'] - df['basePrice1'])
    df['diffPrice2_abs'] = np.abs(df['price2'] - df['basePrice2'])
    df['diffPrice3_abs'] = np.abs(df['price3'] - df['basePrice3'])
    # signed differences, shifted to be non-negative, then aggregated for comparison
    df['diffPrice1'] = df['price1'] - df['basePrice1']
    df['diffPrice2'] = df['price2'] - df['basePrice2']
    df['diffPrice3'] = df['price3'] - df['basePrice3']
    df['diffPrice1'] = df['diffPrice1'] + np.abs(df['diffPrice1'].min())
    df['diffPrice2'] = df['diffPrice2'] + np.abs(df['diffPrice2'].min())
    df['diffPrice3'] = df['diffPrice3'] + np.abs(df['diffPrice3'].min())
    df['diffPrice'] = df['diffPrice1'] + df['diffPrice2'] + df['diffPrice3']
    useless = ['price1', 'basePrice1', 'diffPrice1',
               'price2', 'basePrice2', 'diffPrice2',
               'price3', 'basePrice3', 'diffPrice3']
    df = df.drop(useless, axis=1)
    print(" --- Finished dealing with numeric.")
    return df
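
# Hedged sketch, not part of the pipeline: a quick check of how the log
# transform and the softmax above change the skew of a positively skewed
# sample. The synthetic lognormal data here is made up for illustration.
def _demo_log_vs_softmax():
    rng = np.random.RandomState(0)
    skewed = pd.Series(rng.lognormal(mean=0.0, sigma=1.5, size=10000))
    print("raw skew:     {:.2f}".format(skewed.skew()))
    print("log1p skew:   {:.2f}".format(np.log(skewed + 1).skew()))
    print("softmax skew: {:.2f}".format(pd.Series(softmax(skewed)).skew()))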

def prior_postprocess(df):
    # Turn prior counts into rates; inf/NaN come from zero denominators.
    rates = [
        ('prior_using_coupon_rate', 'prior_using_coupon', 'prior_transactions'),
        ('prior_couponID1_used_rate', 'prior_couponID1_used', 'prior_couponID1_population'),
        ('prior_couponID2_used_rate', 'prior_couponID2_used', 'prior_couponID2_population'),
        ('prior_couponID3_used_rate', 'prior_couponID3_used', 'prior_couponID3_population'),
    ]
    for rate, used, total in rates:
        df[rate] = (df[used] / df[total]).replace([np.inf, np.nan], 0)
    # df['willing_to_use_coupon1'] = (df['prior_using_coupon_rate'] + df['prior_couponID1_used_rate']) - (df['prior_using_coupon_rate'] * df['prior_couponID1_used_rate'])
    # df['willing_to_use_coupon2'] = (df['prior_using_coupon_rate'] + df['prior_couponID2_used_rate']) - (df['prior_using_coupon_rate'] * df['prior_couponID2_used_rate'])
    # df['willing_to_use_coupon3'] = (df['prior_using_coupon_rate'] + df['prior_couponID3_used_rate']) - (df['prior_using_coupon_rate'] * df['prior_couponID3_used_rate'])
    df['prior_mean_basketValue'] = np.log(df['prior_mean_basketValue'].replace(np.nan, 0) + 1)
    df['prior_std_basketValue'] = np.log(df['prior_std_basketValue'].replace(np.nan, 0) + 1)
    return df

def prior_analysis(df, silent=True):
    """Prior analysis (step numbers refer to the comments below):
    2. User prior transactions
    3. Coupon ID prior usage
    4. Prior mean basketValue per user
    """
    df['totalCouponUsed'] = df['coupon1Used'] + df['coupon2Used'] + df['coupon3Used']
    for idx, row in df.iterrows():
        prior = df[df.orderTime < row.orderTime]
        # 2. this user's prior transactions and prior coupon usage
        pt = prior[prior.userID == row.userID]
        pcu = pt[pt.totalCouponUsed > 0]
        df.loc[idx, 'prior_transactions'] = pt.shape[0]
        df.loc[idx, 'prior_using_coupon'] = pcu.shape[0]
        # 3. how often each of this row's coupon IDs appeared in any slot before
        pcid1_1 = prior[prior.couponID1 == row.couponID1]
        pcid1_2 = prior[prior.couponID2 == row.couponID1]
        pcid1_3 = prior[prior.couponID3 == row.couponID1]
        pcid2_1 = prior[prior.couponID1 == row.couponID2]
        pcid2_2 = prior[prior.couponID2 == row.couponID2]
        pcid2_3 = prior[prior.couponID3 == row.couponID2]
        pcid3_1 = prior[prior.couponID1 == row.couponID3]
        pcid3_2 = prior[prior.couponID2 == row.couponID3]
        pcid3_3 = prior[prior.couponID3 == row.couponID3]
        pcid1 = pcid1_1.shape[0] + pcid1_2.shape[0] + pcid1_3.shape[0]
        pcid2 = pcid2_1.shape[0] + pcid2_2.shape[0] + pcid2_3.shape[0]
        pcid3 = pcid3_1.shape[0] + pcid3_2.shape[0] + pcid3_3.shape[0]
        df.loc[idx, 'prior_couponID1_population'] = pcid1
        df.loc[idx, 'prior_couponID2_population'] = pcid2
        df.loc[idx, 'prior_couponID3_population'] = pcid3
        # ... and how often those coupons were actually used
        pcid1_1_used = pcid1_1[pcid1_1.coupon1Used > 0]
        pcid1_2_used = pcid1_2[pcid1_2.coupon2Used > 0]
        pcid1_3_used = pcid1_3[pcid1_3.coupon3Used > 0]
        pcid2_1_used = pcid2_1[pcid2_1.coupon1Used > 0]
        pcid2_2_used = pcid2_2[pcid2_2.coupon2Used > 0]
        pcid2_3_used = pcid2_3[pcid2_3.coupon3Used > 0]
        pcid3_1_used = pcid3_1[pcid3_1.coupon1Used > 0]
        pcid3_2_used = pcid3_2[pcid3_2.coupon2Used > 0]
        pcid3_3_used = pcid3_3[pcid3_3.coupon3Used > 0]
        pcid1_used = pcid1_1_used.shape[0] + pcid1_2_used.shape[0] + pcid1_3_used.shape[0]
        pcid2_used = pcid2_1_used.shape[0] + pcid2_2_used.shape[0] + pcid2_3_used.shape[0]
        pcid3_used = pcid3_1_used.shape[0] + pcid3_2_used.shape[0] + pcid3_3_used.shape[0]
        df.loc[idx, 'prior_couponID1_used'] = pcid1_used
        df.loc[idx, 'prior_couponID2_used'] = pcid2_used
        df.loc[idx, 'prior_couponID3_used'] = pcid3_used
        # 4. prior mean/std of this user's basketValue
        pmbv = pt['basketValue'].mean()
        pstdmbv = pt['basketValue'].std()
        df.loc[idx, 'prior_mean_basketValue'] = pmbv
        df.loc[idx, 'prior_std_basketValue'] = pstdmbv
        if not silent:
            print(" --- Finished processing prior at row: #{}".format(idx))
    df = prior_postprocess(df)
    return df
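
# Hedged sketch, not used by the pipeline: assuming the frame is sorted by
# orderTime and timestamps are not tied, the per-user prior-transaction count
# from the iterrows() scan above equals a grouped cumulative count, which
# avoids the O(n^2) loop. Only prior_transactions is shown; the coupon-ID
# counts would need a similar but more involved treatment.
def _demo_prior_transactions(df):
    df = df.sort_values('orderTime')
    # cumcount() gives, per user, the number of earlier rows for that user
    return df.groupby('userID').cumcount()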

def deal_with_time(df):
    """Deal with the time features.

    Time-series analysis:
    1. Create a delta-time column indicating how long the user has held
       the coupons (in seconds).
    6. What day of the week is it? (Monday, Sunday, etc.)
    """
    print(" --- Dealing with Times.")
    # 1. delta time between receiving the coupons and ordering, in seconds
    date_format = "%Y-%m-%d %H:%M:%S"
    order_time = df['orderTime'].apply(lambda x: time.mktime(time.strptime(x, date_format)))
    coupons_received = df['couponsReceived'].apply(lambda x: time.mktime(time.strptime(x, date_format)))
    df['coupon_delta'] = order_time - coupons_received
    df['coupon_delta'] = np.log(df['coupon_delta'] + 1)
    # not sure why, but this has a decent correlation with basketValue
    df['pressure'] = df['log_sum_price'] * df['coupon_delta']
    # convert to pandas datetime
    df['orderTime'] = pd.to_datetime(df['orderTime'])
    df['couponsReceived'] = pd.to_datetime(df['couponsReceived'])
    # prior analysis
    df = prior_analysis(df)
    # 6. day of week, as dummy variables
    # df['dayofweek'] = df['orderTime'].dt.dayofweek
    dayofweek = pd.get_dummies(df['orderTime'].dt.dayofweek, prefix='dayofweek')
    df = pd.concat([df, dayofweek], axis=1)
    # drop the raw time columns and the helper column
    df = df.drop(['orderTime', 'couponsReceived', 'totalCouponUsed'], axis=1)
    print(" --- Finished dealing with Times.")
    return df
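
# Hedged sketch, equivalent under the same timestamp format: the delta above
# can also be computed with pandas datetimes, avoiding the strptime/mktime
# round-trip. Shown for illustration only; the pipeline keeps the original.
def _demo_coupon_delta(df):
    order = pd.to_datetime(df['orderTime'])
    received = pd.to_datetime(df['couponsReceived'])
    # timedelta series -> seconds, matching order_time - coupons_received
    return (order - received).dt.total_seconds()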

def deal_with_brands(df):
    """Deal with the brand features.

    Just transform them into dummy variables.
    """
    brands = ['brand1', 'brand2', 'brand3']
    # df[brands] = df[brands].fillna('missing')
    print(" --- Dealing with Brands. Binarization.")
    for col in brands:
        category_binary = pd.get_dummies(df[col], prefix=col)
        df = pd.concat([df, category_binary], axis=1)
    df = df.drop(brands, axis=1)
    print(" --- Finished dealing with Brands.")
    return df

def deal_with_coupon_id(df):
    """Drop the raw coupon IDs.

    The IDs were replaced by their population counts, and another analysis
    uses each coupon's prior usage rate; both were already built in
    prior_analysis(), so the raw IDs are no longer needed.
    """
    print(" --- Dealing with coupon id. Drop it.")
    ids = ['couponID1', 'couponID2', 'couponID3']
    df = df.drop(ids, axis=1)
    print(" --- Finished dealing with coupon id.")
    return df


def deal_with_user_id(df):
    """User analysis: binarize the user IDs."""
    print(" --- Dealing with Users. Binarization.")
    users = pd.get_dummies(df['userID'], prefix='userID')
    df = pd.concat([df, users], axis=1)
    df = df.drop('userID', axis=1)
    print(" --- Finished dealing with Users.")
    return df


def deal_with_product_group(df):
    """Product group transformation: drop the columns."""
    print(" --- Dealing with Product Group. Dropping.")
    cols = ['productGroup1', 'productGroup2', 'productGroup3']
    df = df.drop(cols, axis=1)
    print(" --- Finished dealing with Product Group.")
    return df

def deal_with_category(df):
    """Deal with the categorical fields.

    Binarize where useful; handle the ID-based columns individually.
    """
    df = deal_with_coupon_id(df)
    df = deal_with_user_id(df)
    df = deal_with_product_group(df)
    print(" --- Finished dealing with category.")
    return df


def init(df):
    """Base transformation of features (note: not called from main() below).

    Expands the comma-separated categoryIDs columns into indicator columns
    and reorders them to sit next to each coupon's other features.
    """
    x = df.categoryIDs1.str.get_dummies(sep=',')
    y = df.categoryIDs2.str.get_dummies(sep=',')
    z = df.categoryIDs3.str.get_dummies(sep=',')
    xcols = 'categoryIDs1_' + x.columns.values
    ycols = 'categoryIDs2_' + y.columns.values
    zcols = 'categoryIDs3_' + z.columns.values
    x.columns = xcols
    y.columns = ycols
    z.columns = zcols
    indc = pd.concat([x, y, z], axis=1)
    procs = df.drop(['categoryIDs1', 'categoryIDs2', 'categoryIDs3'], axis=1)
    procs = pd.concat([procs, indc], axis=1)
    pcols = procs.columns.tolist()
    pcols = (pcols[:11] + xcols.tolist() + pcols[11:18] + ycols.tolist()
             + pcols[18:25] + zcols.tolist() + pcols[25:29])
    procs = procs[pcols]
    return procs
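
# Hedged sketch of what str.get_dummies does to a comma-separated category
# column, on toy data that is not from the competition:
def _demo_get_dummies():
    s = pd.Series(['1,3', '2', '1,2'])
    print(s.str.get_dummies(sep=','))
    # =>   1  2  3
    #   0  1  0  1
    #   1  0  1  0
    #   2  1  1  0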

def main():
    trains = pd.read_csv('../data/processed_train.csv')
    tests = pd.read_csv('../data/processed_test.csv')
    # remember where to split the concatenated frame back apart
    split = trains.shape[0]
    df = pd.concat([trains, tests], axis=0, ignore_index=True)
    # THE MAGIC :: >
    # =========================================================================
    df = deal_with_numeric(df)
    df = deal_with_time(df)
    df = deal_with_brands(df)
    df = deal_with_category(df)
    # move the targets back to the last columns
    y = ['coupon1Used', 'coupon2Used', 'coupon3Used', 'basketValue']
    targets = df[y]
    df = df.drop(y, axis=1)
    df = pd.concat([df, targets], axis=1)
    proc_trains = df.iloc[:split]
    proc_tests = df.iloc[split:]
    proc_trains.to_csv('../data/train_gue.csv', index=False)
    proc_tests.to_csv('../data/test_gue.csv', index=False)


if __name__ == "__main__":
    main()
"""Script of my solution to DMC 2015
Use this script in the following way:
python solution.py <name-of-submission>
Argument is optional, the script will assign default name.
"""
from __future__ import division
import sys
import pdb
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.externals import joblib
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, ElasticNet, Ridge, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import multiclass
from sklearn import cross_validation
from sklearn import grid_search
from scipy.stats.stats import pearsonr
from XGBoostClassifier import XGBoostClassifier
np.random.seed(17411)

def dmc_loss(y_true, y_pred, eps=1e-15):
    """DMC 2015 loss function.

    y = coupon1, coupon2, coupon3, basketValue

    Example:
    >>> y_true = np.array([[ 1. , 1. , 0. , 187. ],
    ...                    [ 0. , 1. , 0. , 132.5],
    ...                    [ 1. , 0. , 0. , 83.4],
    ...                    [ 0. , 0. , 1. , 50.3]])
    >>> y_pred = y_true.copy()
    >>> loss = dmc_loss(y_true, y_pred)

    With the clipping line below enabled, this example scores
    4.7603819928347129 even for a perfect prediction, because clipping
    squashes the basketValue column into [eps, 1 - eps]; that value was the
    zero-error floor to aim for. With clipping disabled, as it is now, a
    perfect prediction scores 0.

    Parameters
    ----------
    y_true : array, shape = [n_samples, 4]
    y_pred : array, shape = [n_samples, 4]

    Returns
    -------
    loss : float
    """
    # predictions = np.clip(y_pred, eps, 1 - eps)
    predictions = y_pred
    # squared errors, each column scaled by the mean of its true values
    vsota = np.square(np.abs(y_true - predictions) / np.mean(y_true, axis=0))
    loss = np.sum(vsota)
    return loss

def time_series_validation(n, step):
    """Cross-validation generator for time-series data (expanding window)."""
    leap = 100   # how far the split point advances each fold
    k = 2200     # initial training-set size
    h = 650      # test-set (horizon) size
    indices = np.arange(n)
    for i in range(step):
        yield indices[:k], indices[k:k + h]
        k += leap
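
# Hedged sketch of the expanding-window splits the generator yields, using
# the fixed leap=100 / k=2200 / h=650 above and a made-up n=6000:
def _demo_time_series_validation():
    for train, test in time_series_validation(6000, 3):
        print(train[0], train[-1], test[0], test[-1])
    # => 0 2199 2200 2849
    #    0 2299 2300 2949
    #    0 2399 2400 3049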

BASKET_VALUE_FEATURES = [
    'pressure', 'prior_mean_basketValue', 'prior_std_basketValue',
    'prior_using_coupon_rate',
    'dayofweek_0', 'dayofweek_1', 'dayofweek_2',
    'dayofweek_3', 'dayofweek_4', 'dayofweek_5', 'dayofweek_6',
    'premiumProduct1', 'premiumProduct2', 'premiumProduct3',
]

def coupons_col_filter(df, coupons=1):
    """Return the columns to drop for the given coupon's classifier.

    Some features are shared among all the coupon classifiers:
      - diffPrice
      - log_sum_price
    Their purpose is to let each classifier compare across coupons, so they
    are never dropped.
    """
    general = [
        'prior_mean_basketValue', 'prior_std_basketValue',
        'prior_using_coupon_rate', 'pressure',
        'prior_couponID1_population', 'prior_couponID2_population', 'prior_couponID3_population',
    ]
    coupons1 = ['premiumProduct1', 'diffPrice1_abs', 'reward1', 'prior_couponID1_used', 'prior_couponID1_used_rate']
    coupons2 = ['premiumProduct2', 'diffPrice2_abs', 'reward2', 'prior_couponID2_used', 'prior_couponID2_used_rate']
    coupons3 = ['premiumProduct3', 'diffPrice3_abs', 'reward3', 'prior_couponID3_used', 'prior_couponID3_used_rate']
    cols = df.columns.tolist()
    cats1 = [x for x in cols if x.startswith('categoryIDs1')]
    cats2 = [x for x in cols if x.startswith('categoryIDs2')]
    cats3 = [x for x in cols if x.startswith('categoryIDs3')]
    brands1 = [x for x in cols if x.startswith('brand1')]
    brands2 = [x for x in cols if x.startswith('brand2')]
    brands3 = [x for x in cols if x.startswith('brand3')]
    coupons1 += cats1 + brands1
    coupons2 += cats2 + brands2
    coupons3 += cats3 + brands3
    if coupons == 1:
        return general + coupons2 + coupons3
    elif coupons == 2:
        return general + coupons1 + coupons3
    elif coupons == 3:
        return general + coupons1 + coupons2
    else:
        raise KeyError('coupons must be 1, 2 or 3')
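
# Hedged sketch: which columns survive the filter for classifier 1, on a toy
# frame with a made-up subset of the real columns. The `if c in toy.columns`
# guard is needed because the drop list names columns the toy frame lacks.
def _demo_coupons_col_filter():
    toy = pd.DataFrame(columns=['diffPrice', 'log_sum_price', 'pressure',
                                'reward1', 'reward2', 'reward3'])
    drop = [c for c in coupons_col_filter(toy, coupons=1) if c in toy.columns]
    print(toy.drop(drop, axis=1).columns.tolist())
    # => ['diffPrice', 'log_sum_price', 'reward1']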

def make_coupon_features(df, coupons=1):
    filters = coupons_col_filter(df, coupons)
    coupons_features = df.drop(filters, axis=1)
    X_coupons = coupons_features.values.copy()
    # drop the leading column (assumed to be the order ID) and the four
    # target columns appended at the end by the feature-engineering script
    X_coupons = X_coupons[:, 1:-4]
    return X_coupons


def make_features(df):
    # X for classification (coupon features)
    X_coupons1 = make_coupon_features(df, 1)
    X_coupons2 = make_coupon_features(df, 2)
    X_coupons3 = make_coupon_features(df, 3)
    # X for regression (basketValue features)
    basket_features = df[BASKET_VALUE_FEATURES]
    X_basket = basket_features.values.copy()
    return X_coupons1, X_coupons2, X_coupons3, X_basket

def load_train_data(path=None):
    df = pd.read_csv('../data/train_gue.csv')
    # y for classification (coupon used)
    coupon_used = df[['coupon1Used', 'coupon2Used', 'coupon3Used']]
    y_coupons = coupon_used.values.copy()
    # y for regression (basketValue)
    basket_value = df['basketValue']
    y_basket = basket_value.values.copy()
    X_coupons1, X_coupons2, X_coupons3, X_basket = make_features(df)
    print(" -- Data loaded.")
    return (X_coupons1.astype(float), X_coupons2.astype(float),
            X_coupons3.astype(float), X_basket.astype(float),
            y_coupons.astype(int), y_basket.astype(float))


def load_test_data(path=None):
    df = pd.read_csv('../data/test_gue.csv')
    X_coupons1, X_coupons2, X_coupons3, X_basket = make_features(df)
    return (X_coupons1.astype(float), X_coupons2.astype(float),
            X_coupons3.astype(float), X_basket.astype(float))


def create_coupons_model():
    clf = LogisticRegression(penalty='l1')
    return clf


def create_regression_model():
    # clf = ElasticNet(alpha=0.1, l1_ratio=0.01)
    # clf = Ridge(alpha=0.1)
    clf = LinearRegression()
    return clf


def dump_ensemble(i, ids, y_preds):
    lbls = ['coupon1Used', 'coupon2Used', 'coupon3Used', 'basketValue']
    preds = pd.DataFrame(y_preds, index=ids, columns=lbls)
    preds.to_csv("../ensemble/bahrun/bahrun_{}.csv".format(i), index_label='orderID')

def validation():
    """Local cross-validation."""
    X_coupons1, X_coupons2, X_coupons3, X_basket, y_coupons, y_basket = load_train_data()
    klassif = create_coupons_model()
    regress = create_regression_model()
    print(" --- Start local evaluation.")
    # kf = cross_validation.KFold(X_coupons.shape[0], n_folds=10)
    # ss = cross_validation.ShuffleSplit(X_coupons.shape[0], n_iter=100, train_size=0.9)
    tsv = time_series_validation(X_basket.shape[0], 33)
    i = 1
    scores = []
    for train, test in tsv:
        size = test.shape[0]
        # 1.
        klassif.fit(X_coupons1[train], y_coupons[train, 0])
        y_coupons1_preds = klassif.predict_proba(X_coupons1[test])[:, 1]
        y_coupons1_preds = y_coupons1_preds.reshape(size, 1)
        # 2.
        klassif.fit(X_coupons2[train], y_coupons[train, 1])
        y_coupons2_preds = klassif.predict_proba(X_coupons2[test])[:, 1]
        y_coupons2_preds = y_coupons2_preds.reshape(size, 1)
        # 3.
        klassif.fit(X_coupons3[train], y_coupons[train, 2])
        y_coupons3_preds = klassif.predict_proba(X_coupons3[test])[:, 1]
        y_coupons3_preds = y_coupons3_preds.reshape(size, 1)
        y_coupons_preds = np.hstack((y_coupons1_preds, y_coupons2_preds, y_coupons3_preds))
        # regress on log(basketValue); map predictions back with exp
        regress.fit(X_basket[train], np.log(y_basket[train]))
        y_basket_preds = np.exp(regress.predict(X_basket[test]))
        y_basket_preds = y_basket_preds.reshape(size, 1)
        y_preds = np.hstack((y_coupons_preds, y_basket_preds))
        y_coupons_true = y_coupons[test]
        y_basket_true = y_basket[test].reshape(X_basket[test].shape[0], 1)
        y_true = np.hstack((y_coupons_true, y_basket_true))
        score = dmc_loss(y_true, y_preds)
        ids = test + 1  # orderID is taken to be the 1-based row index
        dump_ensemble(i, ids, y_preds)
        print(" ---- Score of #{0} : {1:.5f}".format(i, score))
        i += 1
        scores.append(score)
    print(" --- Finished local evaluation.")
    quantiles = stats.mstats.mquantiles(scores)
    print(" --- Score Results:")
    print(" - min: {:.5f}".format(np.min(scores)))
    print(" - 25%: {:.5f}".format(quantiles[0]))
    print(" - median: {:.5f}".format(np.median(scores)))
    print(" - 75%: {:.5f}".format(quantiles[2]))
    print(" - max: {:.5f}".format(np.max(scores)))
    print(" - mean: {0:.5f} (+/-{1:.5f})".format(np.mean(scores), stats.sem(scores)))

def train():
    X_coupons1, X_coupons2, X_coupons3, X_basket, y_coupons, y_basket = load_train_data()
    klassif1 = create_coupons_model()
    klassif2 = create_coupons_model()
    klassif3 = create_coupons_model()
    regress = create_regression_model()
    print(" -- Start training.")
    klassif1.fit(X_coupons1, y_coupons[:, 0])
    klassif2.fit(X_coupons2, y_coupons[:, 1])
    klassif3.fit(X_coupons3, y_coupons[:, 2])
    regress.fit(X_basket, np.log(y_basket))
    print(" -- Finished training.")
    return klassif1, klassif2, klassif3, regress

def make_submission(classifiers, path='../bahrun_submission.csv'):
    path = sys.argv[1] if len(sys.argv) > 1 else path
    klassif1, klassif2, klassif3, regress = classifiers
    X_coupons1, X_coupons2, X_coupons3, X_basket = load_test_data()
    size = X_basket.shape[0]
    # coupon prediction (classification)
    # 1.
    y_coupons1_preds = klassif1.predict_proba(X_coupons1)[:, 1]
    y_coupons1_preds = y_coupons1_preds.reshape(size, 1)
    # 2.
    y_coupons2_preds = klassif2.predict_proba(X_coupons2)[:, 1]
    y_coupons2_preds = y_coupons2_preds.reshape(size, 1)
    # 3.
    y_coupons3_preds = klassif3.predict_proba(X_coupons3)[:, 1]
    y_coupons3_preds = y_coupons3_preds.reshape(size, 1)
    y_coupons_preds = np.hstack((y_coupons1_preds, y_coupons2_preds, y_coupons3_preds))
    # basket prediction (regression)
    y_basket_preds = np.exp(regress.predict(X_basket))
    y_basket_preds = y_basket_preds.reshape(size, 1)
    y_preds = np.hstack((y_coupons_preds, y_basket_preds))
    subfmt = pd.read_csv('../data/submission_format.csv')
    preds = pd.DataFrame(y_preds, index=subfmt.orderID.values,
                         columns=subfmt.columns[1:])
    # preds.to_csv(path, index_label='orderID', sep="|")
    preds.to_csv(path, index_label='orderID')
    print(" -- Wrote submission to file {}.".format(path))

def main():
    print(" - Start.")
    # validation()
    classifiers = train()
    make_submission(classifiers)
    print(" - Finished.")


if __name__ == '__main__':
    main()