# Created
# November 10, 2018 09:02
# -
# -
# Save sourabhxyz/12578ea35d1725b765efd3ebd5430d79 to your computer and use it in GitHub Desktop.
# ML Mini Project - Model 5
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Took 1227.5 seconds (20 minutes 27 seconds) on the Kaggle server.
# Score:
#   Private: 0.59736
#   Public: 0.62892
import json
import math
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
# Tolerance used when wrapping clock times: an end time within `eps` of 24 h
# is treated as having reached 24 h and is wrapped back by one day.
eps = 1e-6

# The following function takes a timestamp and gets the normalised time
# (hour of day as a fractional number) from it.
def getMH(x):
    """Return the time of day of a Unix timestamp as fractional hours.

    The timestamp is converted with ``datetime.fromtimestamp``, so the
    result is expressed in the local timezone of the machine running the
    script.

    Args:
        x: Unix timestamp (seconds since the epoch).

    Returns:
        ``hour + minute / 60 + second / 3600`` of the converted moment.
    """
    moment = datetime.fromtimestamp(x)
    return moment.hour + moment.minute / 60 + moment.second / 3600


# The following function takes a 'row' of a data frame and uses its features
# "StartTime" and "Len" to calculate the normalised "EndTime" of the trip.
# Also note that a time greater than 24 is not representable on a clock and
# hence such values are further normalised.
def getEndTime(row_):
    """Return the clock time (fractional hours) at which a trip ends.

    Args:
        row_: Mapping-like row providing ``'StartTime'`` (fractional hours)
            and ``'Len'`` (number of 15-second steps in the trip).

    Returns:
        ``StartTime + Len * 15 / 3600``, reduced by whole days until it is
        no longer greater than ``24 - eps``.
    """
    end_time = row_['StartTime'] + (row_['Len'] * 15) / 3600
    # A trip may run past midnight (possibly by several days); bring the
    # value back onto the 24-hour clock.
    while end_time > 24 - eps:
        end_time -= 24
    return end_time


# Given the row and a snapshot, determine whether the snapshot lies within
# the trip.
def satisfy_(row_, snap_):  # Assumption: each trip is less than 24 hours
    """Tell whether the clock time ``snap_`` falls inside the trip ``row_``.

    Args:
        row_: Mapping-like row providing ``'StartTime'`` and ``'EndTime'``
            as fractional hours on a 24-hour clock.
        snap_: Snapshot time as fractional hours.

    Returns:
        ``True`` when the snapshot lies within the trip, else ``False``.
        For a trip that does not cross midnight the bounds are exclusive;
        for a trip that wraps past midnight (``StartTime >= EndTime``) the
        bounds are inclusive.
    """
    st = row_['StartTime']
    et = row_['EndTime']
    if st < et:
        # Case 1: the trip starts and ends on the same day.
        return bool(st < snap_ < et)
    # Case 2: the trip spans midnight, so the snapshot may fall either in
    # the late part [st, 24] or in the early part [0, et].
    return bool((st <= snap_ <= 24) or (0 <= snap_ <= et))


# Given the row of a data frame and a particular snapshot, the following
# function returns the longitude and latitude corresponding to that snapshot.
def getCutLonLat(row_, snap_):
    """Return the (longitude, latitude) a trip has reached at a snapshot.

    ``StartTime`` and ``snap_`` are both clock times in fractional hours,
    while ``POLYLINE`` holds one point per 15 seconds (the same spacing
    ``Len * 15 / 3600`` assumes when the end time is computed). The elapsed
    time is therefore converted from hours to 15-second steps before it is
    used as an index; dividing the hour difference by 15 directly would
    always select one of the first two points.

    Args:
        row_: Mapping-like row providing ``'StartTime'`` (fractional hours)
            and ``'POLYLINE'`` (sequence of ``[lon, lat]`` pairs).
        snap_: Snapshot time as fractional hours.

    Returns:
        Tuple ``(lon, lat)`` of the polyline point at the snapshot, clamped
        to the first/last point of the polyline.
    """
    elapsed_hours = snap_ - row_['StartTime']
    if elapsed_hours < 0:
        # The trip started before midnight and the snapshot is after it.
        elapsed_hours += 24
    steps = elapsed_hours * 3600 / 15
    # Round first so floating-point noise (e.g. 40.0000000001) does not push
    # the ceiling one step too far.
    pos_ = math.ceil(round(steps, 6))
    last_index = len(row_['POLYLINE']) - 1
    pos_ = max(0, min(pos_, last_index))  # avoid an out-of-bounds position
    at = row_['POLYLINE'][pos_]
    return (at[0], at[1])
def Drop_(df):
    """Remove the columns the model does not use from ``df`` in place.

    Args:
        df: DataFrame containing the columns ``TRIP_ID``, ``TAXI_ID``,
            ``DAY_TYPE`` and ``MISSING_DATA``.

    Returns:
        None; ``df`` itself is modified.

    Raises:
        KeyError: If one of the four columns is absent.
    """
    for column in ("TRIP_ID", "TAXI_ID", "DAY_TYPE", "MISSING_DATA"):
        df.drop(column, axis=1, inplace=True)
# Snapshot clock times (fractional hours) for which separate training sets
# and models are built.
snaps = [18.0, 8.5, 17.75, 4.0, 14.5]


def getClosest(at_):
    """Return the index in ``snaps`` of the snapshot nearest to ``at_``.

    Distance is the plain absolute difference (it does not wrap around
    midnight). Ties go to the earliest entry of ``snaps``.

    Args:
        at_: Clock time as fractional hours.

    Returns:
        Index into ``snaps``, or ``-1`` when no snapshot is closer than
        1000 to ``at_``.
    """
    best_gap = 1000
    best_index = -1
    for index, snap in enumerate(snaps):
        gap = abs(snap - at_)
        if gap < best_gap:
            best_gap = gap
            best_index = index
    return best_index
# ---------------------------------------------------------------------------
# Load the raw data.
# ---------------------------------------------------------------------------
test = pd.read_csv('../input/test.csv')
train = pd.read_csv('../input/train.csv')

sub = pd.DataFrame()
sub['TRIP_ID'] = test.TRIP_ID
Drop_(test)

test['POLYLINE'] = test['POLYLINE'].apply(json.loads)
train['POLYLINE'] = train['POLYLINE'].apply(json.loads)
# 'Len' is the number of 15-second steps between consecutive polyline points.
train['Len'] = train.POLYLINE.apply(lambda x: len(x) - 1)
test['Len'] = test.POLYLINE.apply(lambda x: len(x) - 1)

# Remove rows flagged as having missing data (their amount is very low) and
# short trips. `.copy()` gives an independent frame, so the in-place edits
# below do not act on a slice of the original frame.
train = train[(train['MISSING_DATA'] == False) & (train['Len'] > 12)].copy()  # noqa: E712
Drop_(train)

# ---------------------------------------------------------------------------
# Test features: the last known polyline point is the "cut" position.
# ---------------------------------------------------------------------------
test['StartTime'] = test.TIMESTAMP.apply(getMH)
test['StartLon'] = test.POLYLINE.apply(lambda x: x[0][0])
test['StartLat'] = test.POLYLINE.apply(lambda x: x[0][1])
test['CutLon'] = test.POLYLINE.apply(lambda x: x[-1][0])
test['CutLat'] = test.POLYLINE.apply(lambda x: x[-1][1])
test['EndTime'] = test.apply(getEndTime, axis=1)

# Fallback guess: time already travelled plus a constant 500 seconds.
heuristic = pd.DataFrame()
heuristic['Guess'] = test.Len.apply(lambda x: x * 15 + 500)
heuristic['Len'] = test['Len']

testEndTime = test['EndTime'].copy()
test.drop(columns=["POLYLINE", "Len", "EndTime", "TIMESTAMP"], inplace=True)
print(test.head())

# ---------------------------------------------------------------------------
# Training features: one training set per snapshot, holding the trips that
# were in progress at that snapshot, cut at the snapshot position.
# ---------------------------------------------------------------------------
train['StartTime'] = train.TIMESTAMP.apply(getMH)
train['StartLon'] = train.POLYLINE.apply(lambda x: x[0][0])
train['StartLat'] = train.POLYLINE.apply(lambda x: x[0][1])
train['EndTime'] = train.apply(getEndTime, axis=1)

trainSets = []
for snap in snaps:
    in_progress = train.apply(satisfy_, axis=1, snap_=snap).astype(bool)
    subset = train[in_progress].copy()
    cut_points = subset.apply(getCutLonLat, axis=1, snap_=snap)
    subset['CutLon'] = cut_points.apply(lambda p: p[0])
    subset['CutLat'] = cut_points.apply(lambda p: p[1])
    subset.drop(columns=['EndTime', 'TIMESTAMP', 'POLYLINE'], inplace=True)
    # The regression target is the total trip duration in seconds.
    subset['Len'] = subset.Len.apply(lambda x: x * 15)
    trainSets.append(subset)
print(trainSets[0].head())

# ---------------------------------------------------------------------------
# Split every snapshot set by call type. The resulting list is ordered as
# index = snapshot_index * 3 + call_type_offset (A -> 0, B -> 1, C -> 2).
# ---------------------------------------------------------------------------
CALL_TYPES = ['A', 'B', 'C']
BASE_FEATURES = ['StartTime', 'StartLon', 'StartLat', 'CutLon', 'CutLat']
# Feature columns per call type; the same lists are used for training and
# for prediction so the column order always matches.
FEATURES_BY_CALL_TYPE = {
    'A': ['ORIGIN_CALL'] + BASE_FEATURES,
    'B': ['ORIGIN_STAND'] + BASE_FEATURES,
    'C': BASE_FEATURES,
}
DROPPED_BY_CALL_TYPE = {
    'A': ['CALL_TYPE', 'ORIGIN_STAND'],
    'B': ['CALL_TYPE', 'ORIGIN_CALL'],
    'C': ['CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND'],
}

newTrainSets = []
for subset in trainSets:  # for the five training sets, further dividing them
    for call_type in CALL_TYPES:
        part = subset[subset['CALL_TYPE'] == call_type]
        part = part.drop(columns=DROPPED_BY_CALL_TYPE[call_type])
        if call_type == 'B':
            # Rows of type B without a stand cannot be used as-is.
            part = part.dropna(subset=['ORIGIN_STAND'])
        newTrainSets.append(part)

# ---------------------------------------------------------------------------
# Fit one model per (snapshot, call type) pair.
# (An xgboost.XGBRegressor was left disabled as an alternative.)
# ---------------------------------------------------------------------------
models = []
for index, part in enumerate(newTrainSets):
    call_type = CALL_TYPES[index % len(CALL_TYPES)]
    model = RandomForestRegressor(n_estimators=10, max_depth=7, random_state=0)
    X = part[FEATURES_BY_CALL_TYPE[call_type]].values
    y = part['Len'].values
    model.fit(X, y)
    models.append(model)

# ---------------------------------------------------------------------------
# Predict every test trip with the model of its call type and of the
# snapshot closest to the trip's current end time.
# ---------------------------------------------------------------------------
out_ = []
for i in range(test.shape[0]):
    row = test.iloc[i]
    call_type = row['CALL_TYPE']
    if call_type not in ('A', 'B'):
        # Anything that is not A or B is handled by the type-C model.
        call_type = 'C'
    offset_ = CALL_TYPES.index(call_type)
    features = row[FEATURES_BY_CALL_TYPE[call_type]].values.reshape(1, -1)

    index_ = getClosest(testEndTime.iloc[i]) * len(CALL_TYPES) + offset_
    prediction = models[index_].predict(features)[0]

    # Fall back to the heuristic for implausibly long predictions and for
    # trips that have only just started.
    if prediction > 5000 or heuristic['Len'].iloc[i] < 27:
        prediction = heuristic['Guess'].iloc[i]
    out_.append(prediction)

sub['TRAVEL_TIME'] = pd.Series(out_)
sub.to_csv('Model5.csv', index=False)
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment