# Flight Prediction Python Code
# GitHub gist ericshape/090e5c26a05a27a88977dca13caa1fbb, created December 19, 2016 19:18.
# Note: GitHub flagged this file as containing bidirectional Unicode text that
# may be interpreted or compiled differently than it appears; review it in an
# editor that reveals hidden Unicode characters.
import numpy as np | |
import scipy as sp | |
import pandas as pd | |
import sklearn | |
from matplotlib import pyplot as plt | |
from sklearn import preprocessing | |
from sklearn.cross_validation import cross_val_predict | |
from sklearn import metrics | |
from sklearn.metrics import classification_report | |
from itertools import cycle | |
import random | |
import time | |
from sklearn.naive_bayes import GaussianNB | |
from sklearn.svm import SVC | |
from sklearn import tree | |
from sklearn import linear_model | |
from sklearn.ensemble import RandomForestClassifier | |
import xgboost | |
from sklearn.metrics import roc_curve, auc | |
from sklearn.cross_validation import StratifiedKFold | |
from sklearn import preprocessing | |
from collections import defaultdict | |
def _evaluate_classifier(name, clf, X, y):
    """Cross-validate one classifier, print its scores and return ROC data.

    Runs 10-fold ``cross_val_predict``, prints the accuracy, the
    classification report and the elapsed wall-clock time, then refits the
    classifier on all rows to compute a ROC curve.

    :param name: display name used as the prefix of the printed lines.
    :param clf: unfitted estimator exposing ``fit`` and ``predict_proba``;
        it is left fitted on the full ``X``/``y`` when this returns.
    :param X: 2-D feature matrix.
    :param y: sequence of binary labels.
    :return: tuple ``(fpr, tpr, roc_auc)``.
    """
    start_time = time.time()
    predicted = cross_val_predict(clf, X, y, cv=10)
    print("%s accuracy: %s" % (name, metrics.accuracy_score(y, predicted)))
    print(classification_report(y, predicted,
                                target_names=["not delay", "delay"]))
    # The timer covers cross-validation and the two report prints above,
    # but not the refit below.
    print("%s running time (seconds):  %s" % (name, time.time() - start_time))

    # NOTE: the ROC curve is computed on the same rows the model was fitted
    # on, so it is an in-sample (optimistic) estimate, unlike the
    # cross-validated accuracy printed above.
    clf.fit(X, y)
    prob_predict = clf.predict_proba(X)
    fpr, tpr, _ = roc_curve(y, prob_predict[:, 1])
    return fpr, tpr, auc(fpr, tpr)


def _plot_feature_importances(forest, feature_names):
    """Print and plot the feature importances of a fitted random forest.

    :param forest: fitted forest exposing ``feature_importances_`` and
        ``estimators_``.
    :param feature_names: column names, in the column order of the matrix
        the forest was fitted on.

    Saves the bar chart to ``feature_importance.png``.
    """
    importances = forest.feature_importances_
    print(importances)
    # Spread of each feature's importance across the individual trees,
    # drawn as error bars.
    std = np.std([est.feature_importances_ for est in forest.estimators_],
                 axis=0)
    print(np.argsort(importances))
    indices = np.argsort(importances)[::-1]  # most important first

    print("Feature ranking:")
    for rank, idx in enumerate(indices, 1):
        print("%d. feature %d %s (%f)"
              % (rank, idx, feature_names[idx], importances[idx]))

    n_features = len(importances)
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(n_features), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(n_features), indices)
    plt.xlim([-1, n_features])
    plt.savefig("feature_importance.png")


def _plot_roc_curves(roc_results):
    """Draw every ROC curve on one figure, save it and show it.

    :param roc_results: list of ``(name, fpr, tpr, roc_auc)`` tuples; curves
        are drawn, coloured and listed in the legend in this order.

    Saves the figure to ``roc_auc.png`` and then calls ``plt.show()``.
    """
    plt.figure()
    colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue',
                    'darkorange', 'navy'])
    for (name, fpr, tpr, roc_auc), color in zip(roc_results, colors):
        plt.plot(fpr, tpr,
                 label=name + '(area = {0:0.2f})'.format(roc_auc),
                 color=color, linestyle='--', linewidth=4)
    # Diagonal reference line (TPR == FPR).
    plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.savefig("roc_auc.png")
    plt.show()


def predictive_model(df):
    """Train and compare several classifiers that predict ``DEP_DEL15``.

    Rows with a missing value in any required column are dropped, remaining
    missing values are replaced by 0, and six classifiers (naive Bayes,
    logistic regression with L2 and L1 penalties, decision tree, random
    forest, XGBoost) are evaluated with 10-fold cross-validation. Scores are
    printed; ``feature_importance.png`` and ``roc_auc.png`` are written to
    the current directory and the ROC figure is shown.

    :param df: pandas DataFrame containing at least the columns listed in
        ``features`` below. It is not modified; cleaning is done on a copy.
    :raises ValueError: from ``LabelEncoder.transform`` if ``CARRIER`` holds
        a code that is not in ``carrier_list``.
    """
    features = ['DAY_OF_MONTH',
                'DAY_OF_WEEK',
                'FL_DATE',
                'CARRIER',
                'ORIGIN_AIRPORT_ID',
                'ORIGIN',
                'DEST_AIRPORT_ID',
                'DEST',
                'CRS_DEP_TIME',
                'DEP_TIME',
                'DEP_DELAY',
                'DEP_DEL15',
                'DISTANCE']

    # Clean a private copy so the caller's frame is left untouched.
    df = df.dropna(subset=features, how='any')
    df = df.fillna(0)

    # Target: the DEP_DEL15 column, reported as "not delay" / "delay".
    y = list(df["DEP_DEL15"].values)

    # Label-encode the carrier code as an integer (this is NOT one-hot
    # encoding). No imputation is needed: rows with a missing CARRIER were
    # already dropped above.
    carrier_list = ["AA", "AS", "B6", "DL", "EV", "F9", "HA", "MQ", "NK",
                    "OO", "UA", "VX", "WN"]
    le = preprocessing.LabelEncoder()
    le.fit(carrier_list)
    x_carrier = le.transform(df['CARRIER'])

    # Print a summary of the cleaned data.
    print(df.dtypes)
    print(df.describe(include='all'))

    # Feature matrix; x_names labels its columns in order.
    numeric_columns = ["DAY_OF_MONTH", "DAY_OF_WEEK", "CRS_DEP_TIME",
                       "ORIGIN_AIRPORT_ID", "DISTANCE"]
    x_names = ["CARRIER"] + numeric_columns
    X = np.column_stack([x_carrier] + [df[col] for col in numeric_columns])

    # solver='liblinear' is spelled out because the 'l1' penalty requires a
    # solver that supports it, and scikit-learn's default solver has changed
    # between releases.
    forest = RandomForestClassifier(criterion='entropy', n_estimators=50)
    models = [
        ("NB", GaussianNB()),
        ("LR L2", linear_model.LogisticRegression(penalty='l2',
                                                  solver='liblinear')),
        ("LR L1", linear_model.LogisticRegression(penalty='l1',
                                                  solver='liblinear')),
        ("Decision Tree", tree.DecisionTreeClassifier(criterion='entropy')),
        ("Random Forest", forest),
        ("XGBoost", xgboost.XGBClassifier(n_estimators=100)),
    ]

    # A list (not a dict) keeps the legend order and colour assignment
    # deterministic.
    roc_results = []
    for name, clf in models:
        fpr, tpr, roc_auc = _evaluate_classifier(name, clf, X, y)
        roc_results.append((name, fpr, tpr, roc_auc))
        if clf is forest:
            # The helper leaves the forest fitted on the full data.
            _plot_feature_importances(forest, x_names)

    _plot_roc_curves(roc_results)
def main(filename="2015_12_predict.csv"):
    """Load the flight CSV and run the model comparison on it.

    :param filename: path of the CSV file to read; defaults to the file name
        the script was originally hard-wired to.
    :raises IndexError: if the CSV has fewer than 14 columns.
    """
    # Read the CSV data via pandas.
    df = pd.read_csv(filename)
    # Drop the 14th column by position.
    # NOTE(review): presumably an empty trailing column in the export --
    # confirm against the actual CSV header.
    df.drop(df.columns[13], axis=1, inplace=True)
    predictive_model(df)


if __name__ == "__main__":
    main()
# (GitHub page footer: "Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.")