Skip to content

Instantly share code, notes, and snippets.

@ericshape
Created December 19, 2016 19:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ericshape/090e5c26a05a27a88977dca13caa1fbb to your computer and use it in GitHub Desktop.
Save ericshape/090e5c26a05a27a88977dca13caa1fbb to your computer and use it in GitHub Desktop.
Flight Prediction Python Code
import numpy as np
import scipy as sp
import pandas as pd
import sklearn
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.cross_validation import cross_val_predict
from sklearn import metrics
from sklearn.metrics import classification_report
from itertools import cycle
import random
import time
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import StratifiedKFold
from sklearn import preprocessing
from collections import defaultdict
def _evaluate_classifier(clf, label, X, y, n_folds=10):
    """Cross-validate *clf*, print its scores and return its ROC curve.

    Prints the cross-validated accuracy, the per-class classification
    report and the elapsed wall-clock time, each prefixed with *label*.
    Afterwards *clf* is fitted on the full data set and left fitted, so
    callers may inspect attributes such as ``feature_importances_``.

    Args:
        clf: Unfitted estimator exposing ``fit`` and ``predict_proba``.
        label: Human-readable name used in the printed output.
        X: Feature matrix, one row per sample.
        y: Binary target values aligned with the rows of *X*.
        n_folds: Number of cross-validation folds.

    Returns:
        Tuple ``(fpr, tpr, area)`` describing the ROC curve.
    """
    start_time = time.time()
    predicted = cross_val_predict(clf, X, y, cv=n_folds)
    print("%s accuracy: %s" % (label, metrics.accuracy_score(y, predicted)))
    print(classification_report(y, predicted,
                                target_names=["not delay", "delay"]))
    elapsed = time.time() - start_time
    print("%s running time (seconds):  %s" % (label, elapsed))

    # NOTE: the ROC curve is computed from predictions on the same rows the
    # model was fitted on (in-sample), so the reported area is optimistic
    # compared with the cross-validated accuracy printed above.
    clf.fit(X, y)
    prob_predict = clf.predict_proba(X)
    fpr, tpr, _ = roc_curve(y, prob_predict[:, 1])
    return fpr, tpr, auc(fpr, tpr)


def _plot_feature_importances(forest, column_names):
    """Print and plot the feature importances of a fitted random forest.

    The bar chart is saved to ``feature_importance.png``; error bars show
    the standard deviation of the importances across the forest's trees.

    Args:
        forest: Fitted ensemble exposing ``feature_importances_`` and
            ``estimators_``.
        column_names: Names of the columns of the training matrix, in order.
    """
    importances = forest.feature_importances_
    print(importances)
    std = np.std([est.feature_importances_ for est in forest.estimators_],
                 axis=0)
    print(np.argsort(importances))
    indices = np.argsort(importances)[::-1]  # most important first

    print("Feature ranking:")
    for rank, idx in enumerate(indices, 1):
        print("%d. %s (%f)" % (rank, column_names[idx], importances[idx]))

    n_features = len(column_names)
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(n_features), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(n_features), indices)
    plt.xlim([-1, n_features])
    plt.savefig("feature_importance.png")


def _plot_roc_curves(curves):
    """Draw every ROC curve in *curves* on one figure and save it.

    Args:
        curves: Sequence of ``(label, fpr, tpr, area)`` tuples; curves are
            drawn and coloured in the order given.
    """
    plt.figure()
    lw = 2
    colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue',
                    'darkorange', 'navy'])
    for (method, fpr, tpr, area), color in zip(curves, colors):
        plt.plot(fpr, tpr,
                 label=method + '(area = {0:0.2f})'.format(area),
                 color=color, linestyle='--', linewidth=4)
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.savefig("roc_auc.png")
    plt.show()


def predictive_model(df):
    """Compare several classifiers on the ``DEP_DEL15`` target.

    Rows with a missing value in any required column are discarded and the
    remaining missing values are replaced with 0.  Each classifier is then
    evaluated with 10-fold cross-validation (accuracy, classification
    report and timing are printed) and an ROC curve is drawn for it.

    Side effects: prints to stdout, writes ``feature_importance.png`` and
    ``roc_auc.png`` to the working directory and calls ``plt.show()``.
    The DataFrame passed in is not modified.

    Args:
        df: pandas DataFrame containing at least the columns listed in
            ``features`` below.

    Raises:
        ValueError: If no rows remain after dropping incomplete records, or
            (raised by ``LabelEncoder.transform``) if ``CARRIER`` holds a
            code that is not in ``carrier_list``.
    """
    features = ['DAY_OF_MONTH',
                'DAY_OF_WEEK',
                'FL_DATE',
                'CARRIER',
                'ORIGIN_AIRPORT_ID',
                'ORIGIN',
                'DEST_AIRPORT_ID',
                'DEST',
                'CRS_DEP_TIME',
                'DEP_TIME',
                'DEP_DELAY',
                'DEP_DEL15',
                'DISTANCE']

    # Clean a copy rather than mutating the caller's frame in place.
    df = df.dropna(subset=features, how='any').fillna(0)
    if df.empty:
        raise ValueError("no rows left after dropping incomplete records")

    # Target: the DEP_DEL15 column, reported as "not delay" / "delay".
    y = list(df["DEP_DEL15"].values)

    # Carrier codes are label-encoded (one integer per carrier), not one-hot.
    carrier_list = ["AA", "AS", "B6", "DL", "EV", "F9", "HA", "MQ", "NK",
                    "OO", "UA", "VX", "WN"]
    le = preprocessing.LabelEncoder()
    le.fit(carrier_list)
    x_carrier = le.transform(df['CARRIER'])

    # Summary of the cleaned data.
    print(df.dtypes)
    print(df.describe(include='all'))

    # Columns of the design matrix, in order; the first is the encoded
    # carrier, the rest are taken from the frame unchanged.
    model_columns = ["CARRIER", "DAY_OF_MONTH", "DAY_OF_WEEK",
                     "CRS_DEP_TIME", "ORIGIN_AIRPORT_ID", "DISTANCE"]
    X = np.column_stack([x_carrier] + [df[name] for name in model_columns[1:]])

    # The solver is pinned for logistic regression because the 'l1' penalty
    # is only supported by some solvers and the library default has changed
    # between scikit-learn releases.
    classifiers = [
        ("NB", GaussianNB()),
        ("LR L2", linear_model.LogisticRegression(penalty='l2',
                                                  solver='liblinear')),
        ("LR L1", linear_model.LogisticRegression(penalty='l1',
                                                  solver='liblinear')),
        ("Decision Tree", tree.DecisionTreeClassifier(criterion='entropy')),
        ("Random Forest", RandomForestClassifier(criterion='entropy',
                                                 n_estimators=50)),
        ("XGBoost", xgboost.XGBClassifier(n_estimators=100)),
        # Disabled:
        # ("SVM", SVC(probability=True, class_weight={0: 9, 1: 1})),
    ]

    curves = []
    for label, clf in classifiers:
        fpr, tpr, area = _evaluate_classifier(clf, label, X, y)
        curves.append((label, fpr, tpr, area))
        if isinstance(clf, RandomForestClassifier):
            _plot_feature_importances(clf, model_columns)

    _plot_roc_curves(curves)
def main():
    """Read ``2015_12_predict.csv`` and run the classifier comparison on it."""
    flights = pd.read_csv("2015_12_predict.csv")
    # Discard the column at positional index 13 before modelling.
    # NOTE(review): presumably an empty trailing column from the CSV
    # export -- confirm against the data file.
    flights = flights.drop(flights.columns[13], axis=1)
    predictive_model(flights)
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment