OptunaLGBM
# Python script using data from DonorsChoose.org Application Screening
# Forked from https://www.kaggle.com/opanichev/lightgbm-and-tf-idf-starter
# Original version: Validation score: 0.7791025740062782
# - Private Score 0.78470
# - Public Score 0.79516
# Tuned version: Validation score: 0.7799395759019241
# - Private Score 0.78622
# - Public Score 0.79535
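# Expects train.csv, test.csv and resources.csv from the competition in the
# working directory; writes tuning_history.csv and submission.csv.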
import gc
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import lightgbm as lgb_original
import optuna.integration.lightgbm as lgb
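# optuna.integration.lightgbm is a drop-in wrapper around lightgbm whose
# train() runs Optuna's stepwise LightGBM parameter tuning; the plain
# lightgbm module (lgb_original) is kept for the final fit with the tuned
# parameters.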
# Extract features
def extract_features(df):
    cols = [
        'project_title',
        'project_essay_1',
        'project_essay_2',
        'project_essay_3',
        'project_essay_4',
        'project_resource_summary',
    ]
    for col in cols:
        df[f'{col}_len'] = df[col].apply(lambda x: len(str(x)))
        df[f'{col}_wc'] = df[col].apply(lambda x: len(str(x).split(' ')))
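# Expand project_submitted_datetime into year/month/day, weekday, hour and
# minute columns, then replace the raw string with an integer epoch value.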
def process_timestamp(df):
    df['year'] = df['project_submitted_datetime'].apply(
        lambda x: int(x.split('-')[0]))
    df['month'] = df['project_submitted_datetime'].apply(
        lambda x: int(x.split('-')[1]))
    df['date'] = df['project_submitted_datetime'].apply(
        lambda x: int(x.split(' ')[0].split('-')[2]))
    df['day_of_week'] = pd.to_datetime(
        df['project_submitted_datetime']).dt.weekday
    df['hour'] = df['project_submitted_datetime'].apply(
        lambda x: int(x.split(' ')[-1].split(':')[0]))
    df['minute'] = df['project_submitted_datetime'].apply(
        lambda x: int(x.split(' ')[-1].split(':')[1]))
    df['project_submitted_datetime'] = pd.to_datetime(
        df['project_submitted_datetime']).values.astype(np.int64)
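# Load the competition CSVs and build the feature matrices: text lengths,
# aggregated resource statistics, label-encoded categoricals, timestamp
# parts and TF-IDF vectors.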
def load_features():
    # Load Data
    dtype = {
        'id': str,
        'teacher_id': str,
        'teacher_prefix': str,
        'school_state': str,
        'project_submitted_datetime': str,
        'project_grade_category': str,
        'project_subject_categories': str,
        'project_subject_subcategories': str,
        'project_title': str,
        'project_essay_1': str,
        'project_essay_2': str,
        'project_essay_3': str,
        'project_essay_4': str,
        'project_resource_summary': str,
        'teacher_number_of_previously_posted_projects': int,
        'project_is_approved': np.uint8,
    }
    train = pd.read_csv('train.csv', dtype=dtype, low_memory=True)
    test = pd.read_csv('test.csv', dtype=dtype, low_memory=True)
    res = pd.read_csv('resources.csv')
    # Preprocess data
    train['project_essay'] = train.apply(lambda row: ' '.join([
        str(row['project_essay_1']),
        str(row['project_essay_2']),
        str(row['project_essay_3']),
        str(row['project_essay_4']),
    ]), axis=1)
    test['project_essay'] = test.apply(lambda row: ' '.join([
        str(row['project_essay_1']),
        str(row['project_essay_2']),
        str(row['project_essay_3']),
        str(row['project_essay_4']),
    ]), axis=1)
    extract_features(train)
    extract_features(test)
    drop_cols = [
        'project_essay_1',
        'project_essay_2',
        'project_essay_3',
        'project_essay_4',
    ]
    train.drop(drop_cols, axis=1, inplace=True)
    test.drop(drop_cols, axis=1, inplace=True)
    # Stack train and test so the label encoders and TF-IDF vocabularies
    # below are fit on the combined data.
    df_all = pd.concat([train, test], axis=0, sort=False)
    gc.collect()
    # Merge with resources
    res = pd.DataFrame(res[['id', 'quantity', 'price']].groupby('id').agg({
        'quantity': ['sum', 'min', 'max', 'mean', 'std'],
        'price': [
            'count', 'sum', 'min', 'max', 'mean', 'std',
            lambda x: len(np.unique(x)),
        ]}
    )).reset_index()
    res.columns = ['_'.join(col) for col in res.columns]
    res.rename(columns={'id_': 'id'}, inplace=True)
    res['mean_price'] = res['price_sum'] / res['quantity_sum']
    # res['price_max_to_price_min'] = res['price_max']/res['price_min']
    # res['quantity_max_to_quantity_min'] = res['quantity_max']/res['quantity_min']
    train = train.merge(res, on='id', how='left')
    test = test.merge(res, on='id', how='left')
    del res
    gc.collect()
    # Preprocess columns with label encoder
    print('Label Encoder...')
    cols = [
        'teacher_id',
        'teacher_prefix',
        'school_state',
        'project_grade_category',
        'project_subject_categories',
        'project_subject_subcategories'
    ]
    for c in tqdm(cols):
        le = LabelEncoder()
        le.fit(df_all[c].astype(str))
        train[c] = le.transform(train[c].astype(str))
        test[c] = le.transform(test[c].astype(str))
    del le
    gc.collect()
    print('Done.')
    # Preprocess timestamp
    print('Preprocessing timestamp...')
    process_timestamp(train)
    process_timestamp(test)
    print('Done.')
    # Preprocess text
    print('Preprocessing text...')
    cols_to_vectorize = [
        'project_title',
        'project_essay',
        'project_resource_summary'
    ]
    n_features = [400, 4040, 400]
    for c_i, c in tqdm(enumerate(cols_to_vectorize)):
        tfidf = TfidfVectorizer(
            max_features=n_features[c_i],
            norm='l2',
        )
        tfidf.fit(df_all[c])
        # Densify and downcast to float16 to keep the TF-IDF columns small in memory.
        tfidf_train = np.array(tfidf.transform(train[c]).toarray(), dtype=np.float16)
        tfidf_test = np.array(tfidf.transform(test[c]).toarray(), dtype=np.float16)
        for i in range(n_features[c_i]):
            train[c + '_tfidf_' + str(i)] = tfidf_train[:, i]
            test[c + '_tfidf_' + str(i)] = tfidf_test[:, i]
        del tfidf, tfidf_train, tfidf_test
        gc.collect()
    print('Done.')
    del df_all
    gc.collect()
    # Prepare data
    cols_to_drop = [
        'id',
        'teacher_id',
        'project_title',
        'project_essay',
        'project_resource_summary',
        'project_is_approved',
    ]
    X = train.drop(cols_to_drop, axis=1, errors='ignore')
    y = train['project_is_approved']
    X_test = test.drop(cols_to_drop, axis=1, errors='ignore')
    id_test = test['id'].values
    feature_names = list(X.columns)
    print(X.shape, X_test.shape)
    del train, test
    gc.collect()
    return X, y, X_test, id_test, feature_names
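# For each CV fold: run the Optuna tuner, retrain with the best parameters
# using plain LightGBM, and accumulate test predictions (only the first fold
# is run by default; see use_first_fold_only).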
def main():
    X, y, X_test, id_test, feature_names = load_features()
    # Build the model
    use_first_fold_only = True
    n_splits = 5
    n_repeats = 1
    p_buf = []
    kf = RepeatedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=0)
    auc_buf = []
    for fold_idx, (train_index, valid_index) in enumerate(kf.split(X)):
        print('Fold {}/{}'.format(fold_idx + 1, n_splits))
        params = {
            'objective': 'binary',
            'metric': 'auc',
        }
        lgb_train = lgb.Dataset(X.loc[train_index],
                                y.loc[train_index],
                                feature_name=feature_names)
        lgb_valid = lgb.Dataset(X.loc[valid_index],
                                y.loc[valid_index])
        # The Optuna tuner fills best_params and tuning_history in place
        # while it searches (argument style of the optuna release current
        # when this gist was written).
        best_params = {}
        tuning_history = []
        lgb.train(
            params,
            lgb_train,
            num_boost_round=10000,
            valid_sets=[lgb_train, lgb_valid],
            early_stopping_rounds=100,
            best_params=best_params,
            tuning_history=tuning_history)
        pd.DataFrame(tuning_history).to_csv('./tuning_history.csv')
        print('Best parameters: ' + json.dumps(best_params, indent=4))
        # Retrain with the tuned parameters, a lower learning rate and a
        # larger round budget, using plain LightGBM.
        best_params['learning_rate'] = 0.05
        model = lgb_original.train(
            best_params,
            lgb_train,
            num_boost_round=20000,
            valid_sets=[lgb_train, lgb_valid],
            early_stopping_rounds=1000,
            verbose_eval=1000)
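        # Score the tuned model on the held-out fold, then accumulate its
        # test-set predictions for averaging across folds.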
        p = model.predict(X.loc[valid_index], num_iteration=model.best_iteration)
        auc = roc_auc_score(y.loc[valid_index], p)
        print('{} AUC: {}'.format(fold_idx, auc))
        p = model.predict(X_test, num_iteration=model.best_iteration)
        if len(p_buf) == 0:
            p_buf = np.array(p, dtype=np.float16)
        else:
            p_buf += np.array(p, dtype=np.float16)
        auc_buf.append(auc)
        # Comment this out to run all folds
        if use_first_fold_only:
            break
        del model, lgb_train, lgb_valid, p
        gc.collect()
    auc_mean = np.mean(auc_buf)
    auc_std = np.std(auc_buf)
    print('AUC = {:.6f} +/- {:.6f}'.format(auc_mean, auc_std))
    preds = p_buf / (fold_idx + 1)
    # Prepare submission
    subm = pd.DataFrame()
    subm['id'] = id_test
    subm['project_is_approved'] = preds
    subm.to_csv('submission.csv', index=False)
if __name__ == '__main__':
    main()