OptunaLGBM
# Python script using data from DonorsChoose.org Application Screening
# Forked from https://www.kaggle.com/opanichev/lightgbm-and-tf-idf-starter
# Original version: Validation score: 0.7791025740062782
# - Private Score 0.78470
# - Public Score 0.79516
# Tuned version: Validation score: 0.7799395759019241
# - Private Score 0.78622
# - Public Score 0.79535
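# Expects train.csv, test.csv and resources.csv from the competition in the
# working directory; writes tuning_history.csv and submission.csv.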
import gc
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import lightgbm as lgb_original
import optuna.integration.lightgbm as lgb
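# optuna.integration.lightgbm is a drop-in wrapper around lightgbm whose
# train() runs Optuna's stepwise LightGBM parameter tuning; the plain
# lightgbm module (lgb_original) is kept for the final fit with the tuned
# parameters.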
# Extract features
def extract_features(df):
    cols = [
        'project_title',
        'project_essay_1',
        'project_essay_2',
        'project_essay_3',
        'project_essay_4',
        'project_resource_summary',
    ]
    for col in cols:
        df[f'{col}_len'] = df[col].apply(lambda x: len(str(x)))
        df[f'{col}_wc'] = df[col].apply(lambda x: len(str(x).split(' ')))
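# Expand project_submitted_datetime into year/month/day, weekday, hour and
# minute columns, then replace the raw string with an integer epoch value.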
def process_timestamp(df):
    df['year'] = df['project_submitted_datetime'].apply(
        lambda x: int(x.split('-')[0]))
    df['month'] = df['project_submitted_datetime'].apply(
        lambda x: int(x.split('-')[1]))
    df['date'] = df['project_submitted_datetime'].apply(
        lambda x: int(x.split(' ')[0].split('-')[2]))
    df['day_of_week'] = pd.to_datetime(
        df['project_submitted_datetime']).dt.weekday
    df['hour'] = df['project_submitted_datetime'].apply(
        lambda x: int(x.split(' ')[-1].split(':')[0]))
    df['minute'] = df['project_submitted_datetime'].apply(
        lambda x: int(x.split(' ')[-1].split(':')[1]))
    df['project_submitted_datetime'] = pd.to_datetime(
        df['project_submitted_datetime']).values.astype(np.int64)
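# Load the competition CSVs and build the feature matrices: text lengths,
# aggregated resource statistics, label-encoded categoricals, timestamp
# parts and TF-IDF vectors.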
def load_features():
    # Load Data
    dtype = {
        'id': str,
        'teacher_id': str,
        'teacher_prefix': str,
        'school_state': str,
        'project_submitted_datetime': str,
        'project_grade_category': str,
        'project_subject_categories': str,
        'project_subject_subcategories': str,
        'project_title': str,
        'project_essay_1': str,
        'project_essay_2': str,
        'project_essay_3': str,
        'project_essay_4': str,
        'project_resource_summary': str,
        'teacher_number_of_previously_posted_projects': int,
        'project_is_approved': np.uint8,
    }
    train = pd.read_csv('train.csv', dtype=dtype, low_memory=True)
    test = pd.read_csv('test.csv', dtype=dtype, low_memory=True)
    res = pd.read_csv('resources.csv')
    # Preprocess data
    train['project_essay'] = train.apply(lambda row: ' '.join([
        str(row['project_essay_1']),
        str(row['project_essay_2']),
        str(row['project_essay_3']),
        str(row['project_essay_4']),
    ]), axis=1)
    test['project_essay'] = test.apply(lambda row: ' '.join([
        str(row['project_essay_1']),
        str(row['project_essay_2']),
        str(row['project_essay_3']),
        str(row['project_essay_4']),
    ]), axis=1)
    extract_features(train)
    extract_features(test)
    drop_cols = [
        'project_essay_1',
        'project_essay_2',
        'project_essay_3',
        'project_essay_4',
    ]
    train.drop(drop_cols, axis=1, inplace=True)
    test.drop(drop_cols, axis=1, inplace=True)
    # Stack train and test so the label encoders and TF-IDF vocabularies
    # below are fit on the combined data.
    df_all = pd.concat([train, test], axis=0, sort=False)
    gc.collect()
    # Merge with resources
    res = pd.DataFrame(res[['id', 'quantity', 'price']].groupby('id').agg({
        'quantity': ['sum', 'min', 'max', 'mean', 'std'],
        'price': [
            'count', 'sum', 'min', 'max', 'mean', 'std',
            lambda x: len(np.unique(x)),
        ]}
    )).reset_index()
    res.columns = ['_'.join(col) for col in res.columns]
    res.rename(columns={'id_': 'id'}, inplace=True)
    res['mean_price'] = res['price_sum'] / res['quantity_sum']
    # res['price_max_to_price_min'] = res['price_max']/res['price_min']
    # res['quantity_max_to_quantity_min'] = res['quantity_max']/res['quantity_min']
    train = train.merge(res, on='id', how='left')
    test = test.merge(res, on='id', how='left')
    del res
    gc.collect()
    # Preprocess columns with label encoder
    print('Label Encoder...')
    cols = [
        'teacher_id',
        'teacher_prefix',
        'school_state',
        'project_grade_category',
        'project_subject_categories',
        'project_subject_subcategories'
    ]
    for c in tqdm(cols):
        le = LabelEncoder()
        le.fit(df_all[c].astype(str))
        train[c] = le.transform(train[c].astype(str))
        test[c] = le.transform(test[c].astype(str))
    del le
    gc.collect()
    print('Done.')
    # Preprocess timestamp
    print('Preprocessing timestamp...')
    process_timestamp(train)
    process_timestamp(test)
    print('Done.')
    # Preprocess text
    print('Preprocessing text...')
    cols_to_vectorize = [
        'project_title',
        'project_essay',
        'project_resource_summary'
    ]
    n_features = [400, 4040, 400]
    for c_i, c in tqdm(enumerate(cols_to_vectorize)):
        tfidf = TfidfVectorizer(
            max_features=n_features[c_i],
            norm='l2',
        )
        tfidf.fit(df_all[c])
        # Densify and downcast to float16 to keep the TF-IDF columns small in memory.
        tfidf_train = np.array(tfidf.transform(train[c]).toarray(), dtype=np.float16)
        tfidf_test = np.array(tfidf.transform(test[c]).toarray(), dtype=np.float16)
        for i in range(n_features[c_i]):
            train[c + '_tfidf_' + str(i)] = tfidf_train[:, i]
            test[c + '_tfidf_' + str(i)] = tfidf_test[:, i]
        del tfidf, tfidf_train, tfidf_test
        gc.collect()
    print('Done.')
    del df_all
    gc.collect()
    # Prepare data
    cols_to_drop = [
        'id',
        'teacher_id',
        'project_title',
        'project_essay',
        'project_resource_summary',
        'project_is_approved',
    ]
    X = train.drop(cols_to_drop, axis=1, errors='ignore')
    y = train['project_is_approved']
    X_test = test.drop(cols_to_drop, axis=1, errors='ignore')
    id_test = test['id'].values
    feature_names = list(X.columns)
    print(X.shape, X_test.shape)
    del train, test
    gc.collect()
    return X, y, X_test, id_test, feature_names
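# For each CV fold: run the Optuna tuner, retrain with the best parameters
# using plain LightGBM, and accumulate test predictions (only the first fold
# is run by default; see use_first_fold_only).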
def main():
    X, y, X_test, id_test, feature_names = load_features()
    # Build the model
    use_first_fold_only = True
    n_splits = 5
    n_repeats = 1
    p_buf = []
    kf = RepeatedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=0)
    auc_buf = []
    for fold_idx, (train_index, valid_index) in enumerate(kf.split(X)):
        print('Fold {}/{}'.format(fold_idx + 1, n_splits))
        params = {
            'objective': 'binary',
            'metric': 'auc',
        }
        lgb_train = lgb.Dataset(X.loc[train_index],
                                y.loc[train_index],
                                feature_name=feature_names)
        lgb_valid = lgb.Dataset(X.loc[valid_index],
                                y.loc[valid_index])
        # The Optuna tuner fills best_params and tuning_history in place
        # while it searches (argument style of the optuna release current
        # when this gist was written).
        best_params = {}
        tuning_history = []
        lgb.train(
            params,
            lgb_train,
            num_boost_round=10000,
            valid_sets=[lgb_train, lgb_valid],
            early_stopping_rounds=100,
            best_params=best_params,
            tuning_history=tuning_history)
        pd.DataFrame(tuning_history).to_csv('./tuning_history.csv')
        print('Best parameters: ' + json.dumps(best_params, indent=4))
        # Retrain with the tuned parameters, a lower learning rate and a
        # larger round budget, using plain LightGBM.
        best_params['learning_rate'] = 0.05
        model = lgb_original.train(
            best_params,
            lgb_train,
            num_boost_round=20000,
            valid_sets=[lgb_train, lgb_valid],
            early_stopping_rounds=1000,
            verbose_eval=1000)
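        # Score the tuned model on the held-out fold, then accumulate its
        # test-set predictions for averaging across folds.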
        p = model.predict(X.loc[valid_index], num_iteration=model.best_iteration)
        auc = roc_auc_score(y.loc[valid_index], p)
        print('{} AUC: {}'.format(fold_idx, auc))
        p = model.predict(X_test, num_iteration=model.best_iteration)
        if len(p_buf) == 0:
            p_buf = np.array(p, dtype=np.float16)
        else:
            p_buf += np.array(p, dtype=np.float16)
        auc_buf.append(auc)
        # Comment this out to run all folds
        if use_first_fold_only:
            break
        del model, lgb_train, lgb_valid, p
        gc.collect()
    auc_mean = np.mean(auc_buf)
    auc_std = np.std(auc_buf)
    print('AUC = {:.6f} +/- {:.6f}'.format(auc_mean, auc_std))
    preds = p_buf / (fold_idx + 1)
    # Prepare submission
    subm = pd.DataFrame()
    subm['id'] = id_test
    subm['project_is_approved'] = preds
    subm.to_csv('submission.csv', index=False)
if __name__ == '__main__':
    main()