Skip to content

Instantly share code, notes, and snippets.

@akatasonov
Created October 6, 2019 20:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save akatasonov/fc5f031791a3ad0344bb78272008de4f to your computer and use it in GitHub Desktop.
A2 Flights solution
import os
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, log_loss, accuracy_score, roc_auc_score
import numpy as np
from catboost import CatBoostClassifier, Pool, cv
from datetime import date, timedelta
#import shap
import matplotlib.pyplot as pl
#from imblearn.over_sampling import SMOTENC
#pd.set_option('display.max_columns', None)
# experimenting or submitting? True = single hold-out experiment, False = K-fold CV + submission file
XP_MODE = True
# CPU or GPU? (passed to CatBoost task_type)
TASK_TYPE = 'CPU'
# random seed to use everywhere we need it (split, folds, CatBoost)
SEED = 17
# how many CV folds to do on the data (submission mode only)
N_FOLDS = 5
# max number of rows to use for X and y. to reduce time and compare options faster
# NOTE(review): defined but never applied below — confirm whether subsampling was intended
MAX_N = None
# verbosity in catboost is how often progress is printed. with 100=print progress every 100 rounds. 0 is quiet
VERBOSITY = 200
# def is_holiday(row):
# global us_holidays
# dep_day = int(row['DayofMonth'][2:])
# dep_month = int(row['Month'][2:])
# dep_date = date(2018, dep_month, dep_day)
# delta1 = timedelta(days=1)
# delta2 = timedelta(days=2)
# if (dep_date in us_holidays) or \
# ((dep_date - delta1) in us_holidays) or \
# ((dep_date + delta1) in us_holidays) or \
# ((dep_date - delta2) in us_holidays) or \
# ((dep_date + delta2) in us_holidays):
# return 1
# else:
# return 0
# Function to check the input data and make some conversions
def convertStrings(row):
    """Validate one flight record and convert its 'c-N' string fields to ints.

    Mutates and returns *row*, a dict-like record with keys 'Month',
    'DayofMonth', 'DayOfWeek', 'DepTime' and 'Distance'.  The first three
    arrive as strings like 'c-7'; the 'c-' prefix is stripped and the rest
    parsed as int.  Out-of-range values raise AssertionError (validation
    deliberately kept as assert to match the original script; note that
    asserts are stripped under ``python -O``).
    """
    row['Month'] = int(row['Month'][2:])
    row['DayofMonth'] = int(row['DayofMonth'][2:])
    row['DayOfWeek'] = int(row['DayOfWeek'][2:])
    assert 1 <= row['Month'] <= 12, "Invalid month %i" % row['Month']
    # Day-of-month upper bound depends on the month (Feb allowed up to 29).
    if row['Month'] in [1, 3, 5, 7, 8, 10, 12]:
        assert 1 <= row['DayofMonth'] <= 31, \
            "Invalid day of month %i for month %i" % (row['DayofMonth'], row['Month'])
    elif row['Month'] in [4, 6, 9, 11]:
        assert 1 <= row['DayofMonth'] <= 30, \
            "Invalid day of month %i for month %i" % (row['DayofMonth'], row['Month'])
    else:
        assert 1 <= row['DayofMonth'] <= 29, \
            "Invalid day of month %i for month %i" % (row['DayofMonth'], row['Month'])
    assert 1 <= row['DayOfWeek'] <= 7, "Invalid day of week %i" % row['DayOfWeek']
    # DepTime is HHMM; the dataset contains values past 2400 (after midnight).
    assert 0 <= row['DepTime'] <= 2600, "Invalid hour %i" % row['DepTime']
    if row['DepTime'] >= 2400:
        # Report the wrap-around; the value itself is intentionally left
        # unmodified (the assignment below was disabled in the original).
        hour = row['DepTime'] - 2400
        print('Out of range hour %i mapped to %i' % (row['DepTime'], hour))
        #row['DepTime'] = hour
    assert row['Distance'] > 0, "Invalid distance %i" % row['Distance']
    #row['dep_delayed_15min'] = row['dep_delayed_15min']=="Y"
    return row
# Load the flight-delays train/test CSVs from the directory this script lives in.
train_df = pd.read_csv(os.path.dirname(__file__) + '/flight_delays_train.csv')
test_df = pd.read_csv(os.path.dirname(__file__) + '/flight_delays_test.csv')
# Quick class-balance and cardinality summary of the training data.
df_delay = train_df[train_df.dep_delayed_15min=='Y']
df_nodelay = train_df[train_df.dep_delayed_15min=='N']
delays = len(df_delay)
nodelays = len(df_nodelay)
airlines = sorted(train_df.UniqueCarrier.unique())
origins = sorted(train_df.Origin.unique(), reverse=True)
destinations = sorted(train_df.Dest.unique(), reverse=True)
numAirlines = len(airlines)
numOrigins = len(origins)
numDestinations = len(destinations)
minDist = train_df.Distance.min()
maxDist = train_df.Distance.max()
print(' ')
print('Number of delays = %i' %delays)
print('Number of no-delays = %i' %nodelays)
print('Number of airlines = %i' %numAirlines)
print('Number of origins = %i' %numOrigins)
print('Number of destinations = %i' %numDestinations)
print('Minimum distance = %f' %minDist)
print('Maximum distance = %f' %maxDist)
# Concatenate for preprocessing
# Pop the target BEFORE concatenating so the test rows never see it;
# remember the split point to separate the frames again later.
y = train_df.pop('dep_delayed_15min')
y = y.map({'N': 0, 'Y': 1})
train_split = train_df.shape[0]
df = pd.concat((train_df, test_df), sort=False)
# The data is imbalanced (19% delayed, 81% not delayed)
# Try to downsample to balance
# train_pos = train_df[train_df['dep_delayed_15min']=='Y']
# train_neg = train_df[train_df['dep_delayed_15min']=='N']
# train_neg = train_neg.sample(int(train_df.shape[0] * 0.2), random_state=SEED)
# train_df = pd.concat([train_pos, train_neg]).sort_index()
# ADD features
# airports = Airports()
# cc = coco.CountryConverter()
#us_holidays = holidays.CountryHoliday('US')
#train_df = train_df.apply(convertStrings, axis=1, raw=True, reduce=None)
#test_df = test_df.apply(convertStrings, axis=1, raw=True, reduce=None)
# Convert from string to int ('c-7' -> 7)
df['Month'] = df['Month'].str[2:].astype('int')
df['DayofMonth'] = df['DayofMonth'].str[2:].astype('int')
df['DayOfWeek'] = df['DayOfWeek'].str[2:].astype('int')
# Departure hour (DepTime is HHMM; 2400/2500 are after-midnight wrap-arounds)
df['DepHour'] = df['DepTime'] // 100
# Single dict-based assignment instead of the original two chained
# Series.replace(..., inplace=True) calls: chained inplace replace is
# deprecated in pandas 2.x and unreliable under copy-on-write.
df['DepHour'] = df['DepHour'].replace({24: 0, 25: 1})
# Departure minute
df['DepMin'] = df['DepTime'] % 100
# CarrierFlight, i.e. carrier+origin+destination
# df['CaFlight'] = df['UniqueCarrier'] + \
# ':' + df['Origin'] + '->' + df['Dest']
# Experiment with holidays, airlines seem to be from USA
# df['IsHoliday'] = df.apply(lambda row: is_holiday(row), axis=1)
# Short distance?
# df['ShortDist'] = df['Distance'] < 1000
# Binary indicator features for a handful of busy airports.
df['Origin_ORD'] = (df['Origin'] == 'ORD').astype('int')
df['Origin_EWR'] = (df['Origin'] == 'EWR').astype('int')
df['Origin_ATL'] = (df['Origin'] == 'ATL').astype('int')
df['Origin_HNL'] = (df['Origin'] == 'HNL').astype('int')
df['Dest_HNL'] = (df['Dest'] == 'HNL').astype('int')
df['Dest_IAH'] = (df['Dest'] == 'IAH').astype('int')
df['Dest_EWR'] = (df['Dest'] == 'EWR').astype('int')
df['Dest_SLC'] = (df['Dest'] == 'SLC').astype('int')
df['Dest_CVG'] = (df['Dest'] == 'CVG').astype('int')
df['Dest_DFW'] = (df['Dest'] == 'DFW').astype('int')
# Indicator features for carriers with notably different delay rates.
df['UniqueCarrier_EV'] = (df['UniqueCarrier'] == 'EV').astype('int')
df['UniqueCarrier_HA'] = (df['UniqueCarrier'] == 'HA').astype('int')
df['UniqueCarrier_WN'] = (df['UniqueCarrier'] == 'WN').astype('int')
df['UniqueCarrier_FL'] = (df['UniqueCarrier'] == 'FL').astype('int')
df['UniqueCarrier_AS'] = (df['UniqueCarrier'] == 'AS').astype('int')
# Second half of day
df['SecondHalfOfDay'] = (df['DepHour'] > 12).astype('int')
# Second half of week (Thu/Fri)
df['SecondHalfOfWeek'] = df['DayOfWeek'].isin([4, 5]).astype('int')
# Second half of month (days 15..28 only, by design of the original)
df['SecondHalfOfMonth'] = ((df['DayofMonth'] >= 15) & (df['DayofMonth'] < 29)).astype('int')
# Departure minute not on a 5-minute boundary (scheduled vs ad-hoc times)
df['Mod5'] = (df['DepMin'] % 5 != 0).astype('int')
# Route feature: origin+destination concatenated
df['Flight'] = df['Origin'] + df['Dest']
# Daytime
#df['DayTime'] = pd.cut(df['DepHour'], bins=[0, 6, 12, 18, 23], include_lowest=True)
# Season indicators
df['Summer'] = df['Month'].isin([6, 7, 8]).astype('int')
df['December'] = df['Month'].isin([12]).astype('int')
df['March'] = df['Month'].isin([3]).astype('int')
# Label Encoding
# for col in ['Origin', 'Dest', 'UniqueCarrier']:
# df[col] = pd.factorize(df[col])[0]
# Column names the model should treat as categorical (raw categoricals,
# engineered indicators, and the derived time-bucket features).
cat_features_names = ['Month', 'DayofMonth', 'DayOfWeek', 'DepHour', 'DepMin',
                      'UniqueCarrier', 'Origin', 'Dest',
                      'Summer', 'December', 'March',
                      'SecondHalfOfDay', 'SecondHalfOfWeek', 'SecondHalfOfMonth',
                      'Origin_ORD', 'Origin_EWR', 'Origin_ATL', 'Origin_HNL',
                      'Dest_HNL', 'Dest_IAH', 'Dest_EWR', 'Dest_SLC','Dest_CVG',
                      'Dest_DFW',
                      'UniqueCarrier_EV', 'UniqueCarrier_HA', 'UniqueCarrier_WN',
                      'UniqueCarrier_FL', 'UniqueCarrier_AS', 'Mod5', 'Flight'
                      ]
# Converting categorical columns to type 'category' as required by LGBM
# Catboost?  NOTE(review): CatBoost identifies categoricals via the
# cat_features index list below; the dtype conversion is harmless but
# confirm it is still needed.
for c in cat_features_names:
    df[c] = df[c].astype('category')
# FINISHED PROCESSING features
# Split the combined frame back into its train and test parts using the
# split point remembered before concatenation.
train_df, test_df = df.iloc[:train_split], df.iloc[train_split:]
#print(train_df.shape, test_df.shape)
print(train_df.sample(10))
#print('Y value distribution')
# print(train_df['dep_delayed_15min'].describe())
print('Are there NAs?')  # typo fixed: was 'Are ther NAs?'
print(train_df.isna().any())
# df_t = pd.crosstab(train_df.DepHour, train_df.dep_delayed_15min)
# print(df_t.head())
X = train_df
X_test = test_df
# Positional column indices: CatBoost takes categorical / ignored features
# as integer positions, not names.
cat_features = [X.columns.get_loc(col) for col in cat_features_names]
ignored_features_names = []
ignored_features = [X.columns.get_loc(col) for col in ignored_features_names]
# CatBoost training parameters, shared by both the experiment and the
# submission branches below.
params = {
    'loss_function': 'Logloss',  # objective function
    'eval_metric': 'AUC',  # metric reported on the eval set
    'ignored_features': ignored_features,  # column indices to drop (currently none)
    'verbose': VERBOSITY,  # output to stdout info about training process every 200 iterations
    'random_seed': SEED,
    'task_type': TASK_TYPE,  # 'CPU' or 'GPU'
    'iterations': 5000,  # upper bound; early stopping usually ends sooner
    'boosting_type': 'Ordered',  # ordered boosting, helps on smaller datasets
    #'depth': 10,
    # 'border_count': 254,
    'use_best_model': True,  # keep the iteration with the best eval score
    'early_stopping_rounds': 200,  # stop after 200 rounds without improvement
}
if XP_MODE:
    # Experiment branch: single 80/20 hold-out split, train once, report AUC
    # and feature importances.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=SEED)
    # sm = SMOTENC(categorical_features=cat_features, random_state=SEED)
    # X_train, y_train = sm.fit_sample(X_train, y_train)
    train_data = Pool(
        data=X_train,
        label=y_train,
        cat_features=cat_features,
    )
    valid_data = Pool(
        data=X_valid,
        label=y_valid,
        cat_features=cat_features,
    )
    ctb = CatBoostClassifier(**params)
    ctb.fit(
        train_data,
        eval_set=valid_data,  # data to validate on
        # True if we don't want to save trees created after iteration with the best validation score
        use_best_model=True,
        # True for visualization of the training process (it is not shown in a published kernel - try executing this code)
        plot=False
    )
    # Probability of the positive class (delayed >= 15 min).
    ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]
    print('ROC AUC: {}'.format(roc_auc_score(y_valid, ctb_valid_pred)))
    # Print feature importances, highest first.
    feature_importances = ctb.get_feature_importance(train_data)
    feature_names = X_train.columns
    for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
        print('{}: {}'.format(name, score))
    #explainer = shap.TreeExplainer(ctb)
    #shap_values = explainer.shap_values(train_data)
    # shap.initjs()
    #shap.summary_plot(shap_values, X_train, show=False)
    # pl.savefig('summary_plot.pdf')
else:
    # Submission branch: stratified K-fold CV; average the per-fold test-set
    # predictions into the final submission.
    folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    test_data = Pool(data=X_test, cat_features=cat_features)
    scores = []
    prediction = np.zeros(X_test.shape[0])
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        # train and validation data splits
        print('Training fold #{}'.format(fold_n))
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        train_data = Pool(data=X_train,
                          label=y_train,
                          cat_features=cat_features)
        valid_data = Pool(data=X_valid,
                          label=y_valid,
                          cat_features=cat_features)
        model = CatBoostClassifier(**params)
        model.fit(train_data,
                  eval_set=valid_data,
                  use_best_model=True
                  )
        # Best validation AUC reached during this fold's training.
        score = model.get_best_score()['validation']['AUC']
        scores.append(score)
        y_pred = model.predict_proba(test_data)[:, 1]
        prediction += y_pred
    # Average the accumulated per-fold predictions.
    prediction /= N_FOLDS
    print('CV mean: {:.4f}, CV std: {:.4f}'.format(
        np.mean(scores), np.std(scores)))
    # Write predictions into the competition's sample submission template.
    sub = pd.read_csv('sample_submission.csv', index_col='id')
    sub['dep_delayed_15min'] = prediction
    sub_name = 'catboost_submission.csv'
    sub.to_csv(sub_name, index=True)
    print('Saving submission file as: {}'.format(sub_name))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment