Created
October 6, 2019 20:13
-
-
Save akatasonov/fc5f031791a3ad0344bb78272008de4f to your computer and use it in GitHub Desktop.
A2 Flights solution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pandas as pd | |
from sklearn.model_selection import train_test_split, StratifiedKFold | |
from sklearn.metrics import classification_report, confusion_matrix, log_loss, accuracy_score, roc_auc_score | |
import numpy as np | |
from catboost import CatBoostClassifier, Pool, cv | |
from datetime import date, timedelta | |
#import shap | |
import matplotlib.pyplot as pl | |
#from imblearn.over_sampling import SMOTENC | |
#pd.set_option('display.max_columns', None) | |
# experimenting (holdout split + diagnostics) or submitting (K-fold CV + submission file)?
XP_MODE = True
# CatBoost task type: CPU or GPU?
TASK_TYPE = 'CPU'
# random seed to use everywhere we need it
SEED = 17
# how many CV folds to do on the data (used in submission mode)
N_FOLDS = 5
# max number of rows to use for X and y. to reduce time and compare options faster
# NOTE(review): appears unused in the rest of this file — confirm before relying on it
MAX_N = None
# verbosity in catboost is how often progress is printed. with 100=print progress every 100 rounds. 0 is quiet
VERBOSITY = 200
# def is_holiday(row): | |
# global us_holidays | |
# dep_day = int(row['DayofMonth'][2:]) | |
# dep_month = int(row['Month'][2:]) | |
# dep_date = date(2018, dep_month, dep_day) | |
# delta1 = timedelta(days=1) | |
# delta2 = timedelta(days=2) | |
# if (dep_date in us_holidays) or \ | |
# ((dep_date - delta1) in us_holidays) or \ | |
# ((dep_date + delta1) in us_holidays) or \ | |
# ((dep_date - delta2) in us_holidays) or \ | |
# ((dep_date + delta2) in us_holidays): | |
# return 1 | |
# else: | |
# return 0 | |
# Function to check the input data and make some conversions
def convertStrings(row):
    """Validate one raw flight row and convert its string-coded fields to ints.

    Expects 'Month', 'DayofMonth', 'DayOfWeek' as strings whose numeric part
    starts at index 2 (e.g. 'c-8'), plus numeric 'DepTime' (HHMM) and
    'Distance'. Mutates the row in place and returns it.

    Raises:
        ValueError: if any field is out of its valid range. (The original
        used bare ``assert``, which is silently stripped under ``python -O``;
        explicit raises keep validation active in optimized runs.)
    """
    row['Month'] = int(row['Month'][2:])
    row['DayofMonth'] = int(row['DayofMonth'][2:])
    row['DayOfWeek'] = int(row['DayOfWeek'][2:])
    if not (1 <= row['Month'] <= 12):
        raise ValueError("Invalid month %i" % row['Month'])
    # Days-per-month table; February is always allowed up to 29 because the
    # data's year is unknown here (leap year assumed possible).
    if row['Month'] in (1, 3, 5, 7, 8, 10, 12):
        max_day = 31
    elif row['Month'] in (4, 6, 9, 11):
        max_day = 30
    else:
        max_day = 29
    if not (1 <= row['DayofMonth'] <= max_day):
        raise ValueError("Invalid day of month %i for month %i" % (row['DayofMonth'], row['Month']))
    if not (1 <= row['DayOfWeek'] <= 7):
        raise ValueError("Invalid day of week %i" % row['DayOfWeek'])
    if not (0 <= row['DepTime'] <= 2600):
        raise ValueError("Invalid hour %i" % row['DepTime'])
    if row['DepTime'] >= 2400:
        # HHMM times >= 2400 wrap past midnight (e.g. 2430 -> 0030).
        hour = row['DepTime'] - 2400
        print('Out of range hour %i mapped to %i' % (row['DepTime'], hour))
        # NOTE(review): the write-back was deliberately disabled in the
        # original; kept disabled to preserve behavior.
        # row['DepTime'] = hour
    if not row['Distance'] > 0:
        raise ValueError("Invalid distance %i" % row['Distance'])
    return row
# Load the competition CSVs from the directory this script lives in.
# os.path.join is used instead of "dirname(__file__) + '/file'": when the
# script is launched from its own directory, dirname(__file__) is '' and the
# old concatenation produced an absolute path like '/flight_delays_train.csv'.
_data_dir = os.path.dirname(__file__)
train_df = pd.read_csv(os.path.join(_data_dir, 'flight_delays_train.csv'))
test_df = pd.read_csv(os.path.join(_data_dir, 'flight_delays_test.csv'))
# Basic class balance and cardinality diagnostics on the training data.
df_delay = train_df[train_df.dep_delayed_15min == 'Y']
df_nodelay = train_df[train_df.dep_delayed_15min == 'N']
delays = len(df_delay)
nodelays = len(df_nodelay)
airlines = sorted(train_df.UniqueCarrier.unique())
origins = sorted(train_df.Origin.unique(), reverse=True)
destinations = sorted(train_df.Dest.unique(), reverse=True)
numAirlines = len(airlines)
numOrigins = len(origins)
numDestinations = len(destinations)
minDist = train_df.Distance.min()
maxDist = train_df.Distance.max()
print(' ')
print('Number of delays = %i' % delays)
print('Number of no-delays = %i' % nodelays)
print('Number of airlines = %i' % numAirlines)
print('Number of origins = %i' % numOrigins)
print('Number of destinations = %i' % numDestinations)
print('Minimum distance = %f' % minDist)
print('Maximum distance = %f' % maxDist)
# Split off the target and binarize it, then stack train + test so every
# feature transform below is applied once to both frames.
y = train_df.pop('dep_delayed_15min').map({'N': 0, 'Y': 1})
train_split = train_df.shape[0]
df = pd.concat((train_df, test_df), sort=False)
# The data is imbalanced (19% delayed, 81% not delayed)
# Try to downsample to balance
# train_pos = train_df[train_df['dep_delayed_15min']=='Y']
# train_neg = train_df[train_df['dep_delayed_15min']=='N']
# train_neg = train_neg.sample(int(train_df.shape[0] * 0.2), random_state=SEED)
# train_df = pd.concat([train_pos, train_neg]).sort_index()
# ADD features
# airports = Airports()
# cc = coco.CountryConverter()
#us_holidays = holidays.CountryHoliday('US')
#train_df = train_df.apply(convertStrings, axis=1, raw=True, reduce=None)
#test_df = test_df.apply(convertStrings, axis=1, raw=True, reduce=None)
# Date fields arrive as strings like 'c-8'; strip the prefix and convert to int.
for date_col in ('Month', 'DayofMonth', 'DayOfWeek'):
    df[date_col] = df[date_col].str[2:].astype('int')
# Departure hour; hours 24/25 wrap past midnight to 0/1.
df['DepHour'] = (df['DepTime'] // 100).replace({24: 0, 25: 1})
# Departure minute
df['DepMin'] = df['DepTime'] % 100
# CarrierFlight, i.e. carrier+origin+destination
# df['CaFlight'] = df['UniqueCarrier'] + \
#     ':' + df['Origin'] + '->' + df['Dest']
# Experiment with holidays, airlines seem to be from USA
# df['IsHoliday'] = df.apply(lambda row: is_holiday(row), axis=1)
# Short distance?
# df['ShortDist'] = df['Distance'] < 1000
# Binary flags for a handful of busy airports.
for airport in ('ORD', 'EWR', 'ATL', 'HNL'):
    df['Origin_' + airport] = (df['Origin'] == airport).astype('int')
for airport in ('HNL', 'IAH', 'EWR', 'SLC', 'CVG', 'DFW'):
    df['Dest_' + airport] = (df['Dest'] == airport).astype('int')
# Binary flags for carriers with notable delay behavior.
for carrier in ('EV', 'HA', 'WN', 'FL', 'AS'):
    df['UniqueCarrier_' + carrier] = (df['UniqueCarrier'] == carrier).astype('int')
# Coarse "second half" indicators for day, week, and month.
df['SecondHalfOfDay'] = (df['DepHour'] > 12).astype('int')
df['SecondHalfOfWeek'] = df['DayOfWeek'].isin([4, 5]).astype('int')
df['SecondHalfOfMonth'] = ((df['DayofMonth'] >= 15) & (df['DayofMonth'] < 29)).astype('int')
# Departure minute not on a 5-minute boundary.
df['Mod5'] = (df['DepMin'] % 5 != 0).astype('int')
# Route identifier: origin concatenated with destination.
df['Flight'] = df['Origin'] + df['Dest']
# Daytime
#df['DayTime'] = pd.cut(df['DepHour'], bins=[0, 6, 12, 18, 23], include_lowest=True)
# Season / single-month flags.
df['Summer'] = df['Month'].isin([6, 7, 8]).astype('int')
df['December'] = df['Month'].isin([12]).astype('int')
df['March'] = df['Month'].isin([3]).astype('int')
# Label Encoding
# for col in ['Origin', 'Dest', 'UniqueCarrier']:
#     df[col] = pd.factorize(df[col])[0]
# Columns treated as categorical by CatBoost: the raw IDs plus the engineered
# binary flags above.
cat_features_names = ['Month', 'DayofMonth', 'DayOfWeek', 'DepHour', 'DepMin',
                      'UniqueCarrier', 'Origin', 'Dest',
                      'Summer', 'December', 'March',
                      'SecondHalfOfDay', 'SecondHalfOfWeek', 'SecondHalfOfMonth',
                      'Origin_ORD', 'Origin_EWR', 'Origin_ATL', 'Origin_HNL',
                      'Dest_HNL', 'Dest_IAH', 'Dest_EWR', 'Dest_SLC', 'Dest_CVG',
                      'Dest_DFW',
                      'UniqueCarrier_EV', 'UniqueCarrier_HA', 'UniqueCarrier_WN',
                      'UniqueCarrier_FL', 'UniqueCarrier_AS', 'Mod5', 'Flight'
                      ]
# Convert categorical columns to pandas 'category' dtype (CatBoost consumes
# them via the cat_features index list built below).
for c in cat_features_names:
    df[c] = df[c].astype('category')
# FINISHED PROCESSING features — split the combined frame back into
# train and test parts at the recorded boundary.
train_df, test_df = df.iloc[:train_split], df.iloc[train_split:]
#print(train_df.shape, test_df.shape)
print(train_df.sample(10))
#print('Y value distribution')
# print(train_df['dep_delayed_15min'].describe())
print('Are there NAs?')  # typo fixed ('ther' -> 'there')
print(train_df.isna().any())
# df_t = pd.crosstab(train_df.DepHour, train_df.dep_delayed_15min)
# print(df_t.head())
X = train_df
X_test = test_df
# CatBoost wants categorical features as positional column indices.
cat_features = [X.columns.get_loc(col) for col in cat_features_names]
ignored_features_names = []
ignored_features = [X.columns.get_loc(col) for col in ignored_features_names]
params = {
    'loss_function': 'Logloss',  # objective function
    'eval_metric': 'AUC',  # metric reported on the eval set
    'ignored_features': ignored_features,
    'verbose': VERBOSITY,  # print training progress every VERBOSITY iterations
    'random_seed': SEED,
    'task_type': TASK_TYPE,
    'iterations': 5000,  # upper bound; early stopping usually ends sooner
    'boosting_type': 'Ordered',
    #'depth': 10,
    # 'border_count': 254,
    'use_best_model': True,
    'early_stopping_rounds': 200,
}
if XP_MODE:
    # Experiment mode: single 80/20 holdout, report AUC and feature importance.
    # NOTE(review): split is not stratified despite the class imbalance noted
    # above — consider stratify=y; left unchanged to keep results comparable.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=SEED)
    # sm = SMOTENC(categorical_features=cat_features, random_state=SEED)
    # X_train, y_train = sm.fit_sample(X_train, y_train)
    train_data = Pool(
        data=X_train,
        label=y_train,
        cat_features=cat_features,
    )
    valid_data = Pool(
        data=X_valid,
        label=y_valid,
        cat_features=cat_features,
    )
    ctb = CatBoostClassifier(**params)
    ctb.fit(
        train_data,
        eval_set=valid_data,  # data to validate on
        # True if we don't want to save trees created after iteration with the best validation score
        use_best_model=True,
        # True for visualization of the training process (it is not shown in a published kernel - try executing this code)
        plot=False
    )
    # Holdout AUC on the positive-class probabilities.
    ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]
    print('ROC AUC: {}'.format(roc_auc_score(y_valid, ctb_valid_pred)))
    # Feature importances, highest first.
    feature_importances = ctb.get_feature_importance(train_data)
    feature_names = X_train.columns
    for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
        print('{}: {}'.format(name, score))
    #explainer = shap.TreeExplainer(ctb)
    #shap_values = explainer.shap_values(train_data)
    # shap.initjs()
    #shap.summary_plot(shap_values, X_train, show=False)
    # pl.savefig('summary_plot.pdf')
else:
    # Submission mode: stratified K-fold CV, averaging test predictions over folds.
    folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    test_data = Pool(data=X_test, cat_features=cat_features)
    scores = []
    prediction = np.zeros(X_test.shape[0])
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        # train and validation data splits
        print('Training fold #{}'.format(fold_n))
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        train_data = Pool(data=X_train,
                          label=y_train,
                          cat_features=cat_features)
        valid_data = Pool(data=X_valid,
                          label=y_valid,
                          cat_features=cat_features)
        model = CatBoostClassifier(**params)
        model.fit(train_data,
                  eval_set=valid_data,
                  use_best_model=True
                  )
        # Best validation AUC for this fold.
        score = model.get_best_score()['validation']['AUC']
        scores.append(score)
        y_pred = model.predict_proba(test_data)[:, 1]
        prediction += y_pred
    prediction /= N_FOLDS
    print('CV mean: {:.4f}, CV std: {:.4f}'.format(
        np.mean(scores), np.std(scores)))
    # Load the submission template relative to the script directory, matching
    # how the train/test CSVs are loaded (a bare relative path only worked
    # when the CWD happened to be the script directory).
    sub = pd.read_csv(os.path.join(os.path.dirname(__file__), 'sample_submission.csv'),
                      index_col='id')
    sub['dep_delayed_15min'] = prediction
    sub_name = 'catboost_submission.csv'
    sub.to_csv(sub_name, index=True)
    print('Saving submission file as: {}'.format(sub_name))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment