Created
November 10, 2018 09:59
-
-
Save sourabhxyz/c459b3bf4a6641d177a9b8c024e3f263 to your computer and use it in GitHub Desktop.
ML Mini Project - Model 6, 7, 8, 9
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import json | |
import math | |
from datetime import datetime | |
from sklearn.ensemble import RandomForestRegressor | |
import xgboost | |
# Tolerance for the midnight wrap-around in getEndTime: an end time within
# eps of 24 is treated as already belonging to the next day.
eps = 1e-6
def getMH(x, tz=None):  # get normalised time from time
    """Convert a Unix timestamp to fractional hours since midnight.

    Args:
        x: Seconds since the Unix epoch.
        tz: Optional ``tzinfo`` used to interpret the timestamp. The default
            (``None``) keeps the historical behaviour of using the host's
            local timezone, which makes the result machine-dependent; pass
            e.g. ``datetime.timezone.utc`` for reproducible values.

    Returns:
        Hour of day as a float, e.g. 18.5 for 18:30:00.
    """
    moment = datetime.fromtimestamp(x, tz)
    return moment.hour + moment.minute / 60 + moment.second / 3600
def getDay(x, tz=None):  # return weekday as an integer
    """Return the weekday of a Unix timestamp (Monday = 0 ... Sunday = 6).

    Args:
        x: Seconds since the Unix epoch.
        tz: Optional ``tzinfo`` used to interpret the timestamp. The default
            (``None``) keeps the historical behaviour of using the host's
            local timezone, so the day boundary is machine-dependent; pass
            e.g. ``datetime.timezone.utc`` for reproducible values.

    Returns:
        The integer produced by ``datetime.weekday()``.
    """
    return datetime.fromtimestamp(x, tz).weekday()
def getEndTime(row_):
    """Return the hour of day at which the trip in ``row_`` ends.

    ``row_['StartTime']`` is read as fractional hours and ``row_['Len']`` as
    a count of 15-second intervals. The sum is brought back below 24 by
    stepping back one day at a time.
    """
    finish = row_['StartTime'] + (row_['Len'] * 15) / 3600
    # Values within eps of 24 are wrapped as well, so the result can be a
    # tiny negative number instead of something just under 24.
    while finish > 24 - eps:
        finish = finish - 24
    return finish
def correlateTime(row_):
    """Return ``row_['StartTime']`` divided by ``row_['Len'] + 1``.

    The ``+ 1`` keeps the denominator non-zero when ``Len`` is 0.
    """
    return row_['StartTime'] / (row_['Len'] + 1)
def haverSine(row_):
    """Great-circle distance between the start and cut points of ``row_``.

    Reads ``StartLat``/``StartLon`` and ``CutLat``/``CutLon`` in degrees and
    applies the haversine formula with a sphere radius of 6371, so the
    result is in the same unit as that radius (kilometres for Earth).
    """
    start_lat, start_lon = row_['StartLat'], row_['StartLon']
    cut_lat, cut_lon = row_['CutLat'], row_['CutLon']

    # pi / 360 converts degrees to radians and halves the angle in one step.
    half_dlon = np.abs(cut_lon - start_lon) * np.pi / 360
    half_dlat = np.abs(cut_lat - start_lat) * np.pi / 360

    haversine = (
        np.sin(half_dlat)**2
        + np.cos(cut_lat * np.pi / 180) * np.cos(start_lat * np.pi / 180) * np.sin(half_dlon)**2
    )
    return 2 * 6371 * np.arctan2(np.sqrt(haversine), np.sqrt(1 - haversine))
def satisfy_(row_, snap_):  # Assumption, each trip is less than 24 hours
    """Tell whether the trip in ``row_`` is under way at hour ``snap_``.

    A trip whose ``StartTime`` is before its ``EndTime`` matches only
    strictly between the two. Otherwise the trip is taken to span midnight
    and matches from ``StartTime`` up to 24 or from 0 up to ``EndTime``,
    bounds included.
    """
    begin, finish = row_['StartTime'], row_['EndTime']
    if begin < finish:
        return bool(begin < snap_ < finish)
    return bool(begin <= snap_ <= 24 or 0 <= snap_ <= finish)
def getCutLonLat(row_, snap_):
    """Return the ``(lon, lat)`` polyline point reached at hour ``snap_``.

    ``row_['StartTime']`` and ``snap_`` are fractional hours of the day and
    ``row_['POLYLINE']`` holds one ``[lon, lat]`` point per 15 seconds, so
    the elapsed time has to be converted from hours to 15-second steps
    before it can be used as an index. (The previous code divided the hour
    difference by 15 directly, which almost always selected point 0 or 1.)

    Args:
        row_: Mapping with ``StartTime`` (hours) and ``POLYLINE``.
        snap_: Snapshot time in hours. A value earlier than ``StartTime`` is
            read as falling on the following day, matching the
            less-than-24-hours assumption used for midnight-spanning trips.

    Returns:
        Tuple ``(lon, lat)``; the index is clamped to the polyline bounds.
    """
    elapsed_hours = snap_ - row_['StartTime']
    if elapsed_hours < 0:
        # Trip started before midnight, snapshot is after it.
        elapsed_hours += 24
    # hours -> seconds -> 15-second samples. Rounding first stops float
    # noise (e.g. 24.0000000000003) from pushing ceil to the next sample.
    pos_ = math.ceil(round(elapsed_hours * 3600 / 15, 6))
    pos_ = max(0, min(pos_, len(row_['POLYLINE']) - 1))
    at = row_['POLYLINE'][pos_]
    return (at[0], at[1])
def getDistance(row_, snap_):
    """Return how many polyline points the trip has produced by ``snap_``.

    ``row_['StartTime']`` and ``snap_`` are fractional hours of the day and
    ``row_['POLYLINE']`` holds one point per 15 seconds, so the elapsed time
    is converted from hours to 15-second steps. (The previous code divided
    the hour difference by 15 directly, which almost always yielded 1 or 2.)

    Args:
        row_: Mapping with ``StartTime`` (hours) and ``POLYLINE``.
        snap_: Snapshot time in hours. A value earlier than ``StartTime`` is
            read as falling on the following day.

    Returns:
        The clamped point index plus one, i.e. a count between 1 and the
        polyline length for a non-empty polyline.
    """
    elapsed_hours = snap_ - row_['StartTime']
    if elapsed_hours < 0:
        # Trip started before midnight, snapshot is after it.
        elapsed_hours += 24
    # hours -> seconds -> 15-second samples. Rounding first stops float
    # noise from pushing ceil to the next sample.
    pos_ = math.ceil(round(elapsed_hours * 3600 / 15, 6))
    pos_ = max(0, min(pos_, len(row_['POLYLINE']) - 1))
    return pos_ + 1
def Drop_(df):
    """Remove the bookkeeping columns from ``df`` in place.

    Drops ``TRIP_ID``, ``TAXI_ID``, ``DAY_TYPE`` and ``MISSING_DATA`` one
    after another; a missing column raises ``KeyError`` at that point.
    """
    for column in ("TRIP_ID", "TAXI_ID", "DAY_TYPE", "MISSING_DATA"):
        df.drop(column, axis=1, inplace=True)
# Snapshot times, as fractional hours of the day, that the models are keyed on.
snaps = [18.0, 8.5, 17.75, 4.0, 14.5]


def getClosest(at_):
    """Return the index in ``snaps`` of the entry nearest to ``at_``.

    Distance is the plain absolute difference (no wrap-around at midnight)
    and the earliest entry wins a tie. If no entry is closer than 1000 the
    result is -1.
    """
    nearest_index = -1
    nearest_gap = 1000
    for index, snapshot in enumerate(snaps):
        gap = abs(snapshot - at_)
        if gap < nearest_gap:
            nearest_gap, nearest_index = gap, index
    return nearest_index
# --- Load the raw data and prepare the submission frames ---------------------
test = pd.read_csv('../input/test.csv')
train = pd.read_csv('../input/train.csv')

# One submission frame per model / heuristic combination, keyed by trip id.
# TRIP_ID has to be captured here because Drop_ removes it from `test`.
sub1 = pd.DataFrame({'TRIP_ID': test.TRIP_ID})
sub2 = pd.DataFrame({'TRIP_ID': test.TRIP_ID})
sub3 = pd.DataFrame({'TRIP_ID': test.TRIP_ID})
sub4 = pd.DataFrame({'TRIP_ID': test.TRIP_ID})

Drop_(test)

# POLYLINE arrives as a JSON string; Len is the number of intervals between
# consecutive points (points - 1).
test['POLYLINE'] = test['POLYLINE'].apply(json.loads)
train['POLYLINE'] = train['POLYLINE'].apply(json.loads)
train['Len'] = train['POLYLINE'].apply(lambda points: len(points) - 1)
test['Len'] = test['POLYLINE'].apply(lambda points: len(points) - 1)

# Keep only complete trips of a plausible length. The filters are combined
# into one mask and the result is copied so that the in-place drops below
# act on an independent frame instead of on a slice of the raw data.
MIN_TRAIN_LEN = 7    # trips with Len <= 7 are discarded as too short
MAX_TRAIN_LEN = 480  # trips with Len >= 480 are discarded as too long
keep = (
    (train['MISSING_DATA'] == False)  # removing insignificant data (as its amount is very low)
    & (train['Len'] > MIN_TRAIN_LEN)
    & (train['Len'] < MAX_TRAIN_LEN)
)
train = train[keep].copy()
Drop_(train)
# --- Features for the test set -----------------------------------------------
# Columns are added in a fixed order because later code selects them by
# position.
test['StartTime'] = test['TIMESTAMP'].apply(getMH)
test['StartLon'] = test['POLYLINE'].apply(lambda points: points[0][0])
test['StartLat'] = test['POLYLINE'].apply(lambda points: points[0][1])
# The "cut" point of a test trip is the last point of its polyline.
test['CutLon'] = test['POLYLINE'].apply(lambda points: points[-1][0])
test['CutLat'] = test['POLYLINE'].apply(lambda points: points[-1][1])
test['DiffLon'] = test['CutLon'] - test['StartLon']
test['DiffLat'] = test['CutLat'] - test['StartLat']
# EndTime must be derived while StartTime still holds the hour of day.
test['EndTime'] = test.apply(getEndTime, axis=1)

# Fallback estimate: 15 seconds per interval already seen, plus 500.
heuristic = pd.DataFrame()
heuristic['Guess'] = test['Len'] * 15 + 500
heuristic['Len'] = test['Len']

# From here on StartTime holds the value produced by correlateTime.
test['StartTime'] = test.apply(correlateTime, axis=1)
test['Distance'] = test.apply(haverSine, axis=1)
test['WeekDay'] = test['TIMESTAMP'].apply(getDay)

# EndTime is kept aside for picking a snapshot model; it is not a feature.
testEndTime = test['EndTime']
test.drop(['POLYLINE', 'Len', 'EndTime', 'TIMESTAMP'], axis=1, inplace=True)
print(test.head())
# --- One training set per snapshot time --------------------------------------
train['StartTime'] = train['TIMESTAMP'].apply(getMH)
train['StartLon'] = train['POLYLINE'].apply(lambda points: points[0][0])
train['StartLat'] = train['POLYLINE'].apply(lambda points: points[0][1])
train['EndTime'] = train.apply(getEndTime, axis=1)

trainSets = []
for snap in snaps:
    # Trips that are under way at this snapshot. The subset is copied so the
    # column edits below never write into (a slice of) `train`, and no
    # scratch column is left behind on `train`.
    under_way = train.apply(satisfy_, axis=1, snap_=snap)
    subset = train[under_way == True].copy()

    # The cut point needs StartTime as hour of day, so compute it before
    # StartTime is replaced by the correlateTime value.
    cut_points = subset.apply(getCutLonLat, axis=1, snap_=snap)
    # Computed on the subset only; the rows are the same ones that index
    # alignment used to pick out of a full-`train` computation.
    # NOTE(review): Len here is the full trip length (the training target),
    # whereas the test features use the test polyline length - confirm this
    # asymmetry is intended.
    subset['StartTime'] = subset.apply(correlateTime, axis=1)
    subset.drop(['EndTime', 'POLYLINE'], axis=1, inplace=True)

    subset['CutLon'] = cut_points.apply(lambda point: point[0])
    subset['CutLat'] = cut_points.apply(lambda point: point[1])
    # Len becomes the trip duration in seconds (15 seconds per interval).
    subset['Len'] = subset['Len'] * 15
    subset['DiffLon'] = subset['CutLon'] - subset['StartLon']
    subset['DiffLat'] = subset['CutLat'] - subset['StartLat']
    subset['Distance'] = subset.apply(haverSine, axis=1)
    subset['WeekDay'] = subset['TIMESTAMP'].apply(getDay)
    subset.drop('TIMESTAMP', axis=1, inplace=True)

    trainSets.append(subset)

print(trainSets[0].head())
# --- Split every snapshot set by call type -----------------------------------
# Layout of newTrainSets: index = snapshot_index * 3 + offset, with offset
# 0 for 'A', 1 for 'B' and 2 for 'C'. CALL_TYPE is constant inside a subset
# so it is always dropped; 'A' keeps ORIGIN_CALL, 'B' keeps ORIGIN_STAND and
# 'C' keeps neither identifier.
_UNUSED_COLUMNS_BY_CALL_TYPE = (
    ('A', ['CALL_TYPE', 'ORIGIN_STAND']),
    ('B', ['CALL_TYPE', 'ORIGIN_CALL']),
    ('C', ['CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND']),
)

newTrainSets = []
for snapshot_set in trainSets:  # for the five training sets, further dividing them
    for call_type, unused in _UNUSED_COLUMNS_BY_CALL_TYPE:
        of_type = snapshot_set[snapshot_set['CALL_TYPE'] == call_type]
        # A non-inplace drop returns a new frame, so nothing is written back
        # into a slice of the snapshot set.
        newTrainSets.append(of_type.drop(unused, axis=1))
# --- Fit the models, predict the test set and write the submissions ----------
# Predictions above this many seconds are replaced by the heuristic guess.
MAX_TRUSTED_PREDICTION = 5000
# Test trips with fewer than this many intervals also get the heuristic guess.
MIN_LEN_FOR_MODEL = 27

# Position of a call type's model inside each group of three; any call type
# other than 'A' or 'B' uses the third ('C') model.
_MODEL_OFFSET = {'A': 0, 'B': 1}
_ID_COLUMNS = ('CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND')


def _fit_models(make_model):
    """Fit one fresh regressor per entry of ``newTrainSets``.

    ``Len`` is the target; every other column of the subset, in its
    existing order, is a feature.
    """
    fitted = []
    for subset in newTrainSets:
        model = make_model()
        model.fit(subset.drop('Len', axis=1).values, subset['Len'].values)
        fitted.append(model)
    return fitted


def _predict_rows(fitted):
    """Predict each test row with the model for its snapshot and call type.

    Returns a list holding one scalar prediction per row of ``test``.
    """
    shared = [column for column in test.columns if column not in _ID_COLUMNS]
    # One numeric feature matrix per model offset, laid out like the
    # training features: the call type's identifier first, then the shared
    # columns. Casting to float gives the estimators numeric input.
    features = {
        0: test[['ORIGIN_CALL'] + shared].values.astype(float),
        1: test[['ORIGIN_STAND'] + shared].values.astype(float),
        2: test[shared].values.astype(float),
    }
    call_types = test['CALL_TYPE'].values

    predictions = []
    for i in range(test.shape[0]):
        offset = _MODEL_OFFSET.get(call_types[i], 2)
        # Positional lookup, to match the positional walk over `test`.
        snapshot = getClosest(testEndTime.iloc[i])
        model = fitted[snapshot * 3 + offset]
        predictions.append(model.predict(features[offset][i:i + 1])[0])
    return predictions


def _with_heuristic(predictions):
    """Return a copy of ``predictions`` with the heuristic fallback applied.

    A row takes ``heuristic['Guess']`` when its prediction exceeds
    MAX_TRUSTED_PREDICTION or its ``heuristic['Len']`` is below
    MIN_LEN_FOR_MODEL.
    """
    adjusted = list(predictions)
    for i in range(len(adjusted)):
        if adjusted[i] > MAX_TRUSTED_PREDICTION or heuristic.iloc[i, 1] < MIN_LEN_FOR_MODEL:
            adjusted[i] = heuristic.iloc[i, 0]
    return adjusted


# XGBoost: raw predictions first, then the same predictions with the
# heuristic applied (no need to run the models a second time).
models = _fit_models(xgboost.XGBRegressor)
out_ = _predict_rows(models)
sub1['TRAVEL_TIME'] = pd.Series(out_)
sub1.to_csv('xgBoostWithoutHeu.csv', index=False)

out_ = _with_heuristic(out_)
sub3['TRAVEL_TIME'] = pd.Series(out_)
sub3.to_csv('xgBoostWithHeu.csv', index=False)

# Before fitting the random forests, remove the 'B' training rows that have
# no ORIGIN_STAND value.
for i in range(len(snaps)):
    stand_set = newTrainSets[i * 3 + 1]
    stand_set.drop(index=stand_set.loc[stand_set['ORIGIN_STAND'].isnull()].index, inplace=True)

models = _fit_models(
    lambda: RandomForestRegressor(n_estimators=10, max_depth=7, random_state=0)
)
out_ = _predict_rows(models)
sub2['TRAVEL_TIME'] = pd.Series(out_)
sub2.to_csv('RForestWithoutHeu.csv', index=False)

out_ = _with_heuristic(out_)
sub4['TRAVEL_TIME'] = pd.Series(out_)
sub4.to_csv('RForestWithHeu.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment