Skip to content

Instantly share code, notes, and snippets.

@sourabhxyz
Created November 10, 2018 09:02
Show Gist options
  • Save sourabhxyz/12578ea35d1725b765efd3ebd5430d79 to your computer and use it in GitHub Desktop.
Save sourabhxyz/12578ea35d1725b765efd3ebd5430d79 to your computer and use it in GitHub Desktop.
ML Mini Project - Model 5
# Runtime: 1227.5 seconds (about 20 minutes 27 seconds) on a Kaggle server.
# Leaderboard score:
#   Private: 0.59736
#   Public: 0.62892
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import math
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
# Small tolerance used by getEndTime when wrapping an end time back below 24 hours.
eps = 1e-6
""" The following function takes timestamp and get normalised time from it """
def getMH(x):
    """Convert a Unix timestamp into a time of day in fractional hours.

    The timestamp is decoded with ``datetime.fromtimestamp``, so the result
    depends on the local timezone of the machine running the script.

    Args:
        x: Unix timestamp in seconds.

    Returns:
        The time of day as ``hour + minute / 60 + second / 3600``.
    """
    moment = datetime.fromtimestamp(x)
    hours, minutes, seconds = moment.hour, moment.minute, moment.second
    return hours + minutes / 60 + seconds / 3600
""" The following function takes a 'row' of a data frame and uses its features "StartTime" and "Len" to calculate the normalised "EndTime" of the trip. Also note that, time greater than 24 is not representable in a clock and hence such values are further normalised. """
def getEndTime(row_):
    """Return the clock time (fractional hours) at which a trip ends.

    The duration is ``Len`` steps of 15 seconds each, converted to hours and
    added to ``StartTime``. Values above ``24 - eps`` are brought back onto
    the clock by repeatedly subtracting 24.

    Args:
        row_: Row (or mapping) providing 'StartTime' in fractional hours and
            'Len', the number of 15-second steps in the trip.

    Returns:
        The wrapped end time in fractional hours.
    """
    finish = row_['StartTime'] + (row_['Len'] * 15) / 3600
    while finish > 24 - eps:
        finish -= 24
    return finish
""" Given the row and a snapshot, determine whether the snapshot lies withn the trip """
def satisfy_(row_, snap_):
    """Tell whether a snapshot time falls inside a trip.

    Assumes each trip lasts less than 24 hours.

    Args:
        row_: Row (or mapping) with 'StartTime' and 'EndTime' as clock times.
        snap_: Snapshot clock time, on the same scale as the row's times.

    Returns:
        True when the snapshot lies within the trip, False otherwise.
    """
    begin, finish = row_['StartTime'], row_['EndTime']
    if begin < finish:
        # Trip stays within one day: both bounds are exclusive.
        return bool(begin < snap_ < finish)
    # Trip runs through midnight (or begin == finish): bounds are inclusive.
    return bool(begin <= snap_ <= 24 or 0 <= snap_ <= finish)
""" Given the row of a data frame and a particluar snapshot, it will return the latitude and longitude corresponding to that snapshot """
def getCutLonLat(row_, snap_):
    """Return the point of a trip's polyline reached at a snapshot time.

    ``StartTime`` and ``snap_`` are clock times in fractional hours, while the
    polyline is treated as one point per 15 seconds (the same step
    ``getEndTime`` uses). The elapsed time is therefore converted from hours
    to seconds before being turned into a polyline index. It is taken modulo
    24 hours so that a trip which started before midnight and is sampled
    after midnight still gives a positive offset.

    The caller is expected to pass only rows for which the snapshot lies
    inside the trip (see ``satisfy_``).

    Args:
        row_: Row (or mapping) with 'StartTime' in fractional hours and
            'POLYLINE', a non-empty sequence of points.
        snap_: Snapshot clock time in fractional hours.

    Returns:
        A tuple with the first two entries of the selected point (stored by
        the caller as CutLon and CutLat).
    """
    elapsed_hours = (snap_ - row_['StartTime']) % 24
    # Hours -> seconds, then seconds -> number of 15-second steps.
    pos_ = math.ceil(elapsed_hours * 3600 / 15)
    # Clamp to the valid index range to avoid an out-of-bounds position.
    pos_ = max(0, min(pos_, len(row_['POLYLINE']) - 1))
    at = row_['POLYLINE'][pos_]
    return (at[0], at[1])
def Drop_(df):
    """Remove the columns the models do not use, modifying ``df`` in place.

    Args:
        df: DataFrame containing the columns TRIP_ID, TAXI_ID, DAY_TYPE and
            MISSING_DATA.

    Returns:
        None; ``df`` itself is changed.
    """
    # Dropped one at a time, in this order, exactly like four separate calls.
    for unused in ("TRIP_ID", "TAXI_ID", "DAY_TYPE", "MISSING_DATA"):
        df.drop(unused, axis=1, inplace=True)
# Snapshot clock times in fractional hours; one group of models is trained per entry.
# NOTE(review): presumably these are the cut-off times of the test set -- confirm.
snaps = [18.0, 8.5, 17.75, 4.0, 14.5]


def getClosest(at_):
    """Return the index in ``snaps`` of the snapshot nearest to ``at_``.

    The distance is the plain absolute difference. On a tie the earliest
    index wins. If no snapshot is closer than 1000, -1 is returned.

    Args:
        at_: Clock time in fractional hours.

    Returns:
        Index into ``snaps``, or -1 when nothing is within 1000.
    """
    best_index = -1
    smallest_gap = 1000
    for index, snap in enumerate(snaps):
        gap = abs(snap - at_)
        if gap < smallest_gap:
            best_index, smallest_gap = index, gap
    return best_index
# ---- Load the raw data ------------------------------------------------------
test = pd.read_csv('../input/test.csv')
train = pd.read_csv('../input/train.csv')

# Keep the trip ids for the submission before Drop_ removes them from `test`.
sub = pd.DataFrame()
sub['TRIP_ID'] = test.TRIP_ID
Drop_(test)

# POLYLINE is read as a JSON string; decode it into a list of points.
test['POLYLINE'] = test['POLYLINE'].apply(json.loads)
train['POLYLINE'] = train['POLYLINE'].apply(json.loads)

# Len = number of steps between consecutive polyline points.
train['Len'] = train.POLYLINE.apply(lambda x: len(x) - 1)
test['Len'] = test.POLYLINE.apply(lambda x: len(x) - 1)

train = train[train['MISSING_DATA'] == False]  # removing insignificant data (as its amount is very low)
# Take an explicit copy: `train` is modified in place below (Drop_, new
# columns), which must not happen on a filtered slice of the original frame.
train = train[train['Len'] > 12].copy()  # removing short trips
Drop_(train)

# ---- Features of the test trips --------------------------------------------
# NOTE: the prediction loop addresses the columns of `test` by position, so
# the order in which columns are added and dropped here must not change.
test['StartTime'] = test.TIMESTAMP.apply(getMH)
test['StartLon'] = test.POLYLINE.apply(lambda x: x[0][0])
test['StartLat'] = test.POLYLINE.apply(lambda x: x[0][1])
# The "cut" position of a test trip is the last point of its polyline.
test['CutLon'] = test.POLYLINE.apply(lambda x: x[len(x) - 1][0])
test['CutLat'] = test.POLYLINE.apply(lambda x: x[len(x) - 1][1])
test['EndTime'] = test.apply(getEndTime, axis=1)

# Fallback estimate: the time covered by the polyline (15 s per step) plus 500.
heuristic = pd.DataFrame()
heuristic['Guess'] = test.Len.apply(lambda x: x * 15 + 500)
heuristic['Len'] = test['Len']

test.drop("POLYLINE", axis=1, inplace=True)
test.drop("Len", axis=1, inplace=True)
# EndTime is only needed to choose a model, so keep it outside the features.
testEndTime = test.loc[:, 'EndTime']
test.drop("EndTime", axis=1, inplace=True)
test.drop("TIMESTAMP", axis=1, inplace=True)
print(test.head())
# ---- Features of the training trips ----------------------------------------
train['StartTime'] = train.TIMESTAMP.apply(getMH)
train['StartLon'] = train.POLYLINE.apply(lambda x: x[0][0])
train['StartLat'] = train.POLYLINE.apply(lambda x: x[0][1])
train['EndTime'] = train.apply(getEndTime, axis=1)

# One training set per snapshot: the trips that were under way at that time,
# described by their start and by the position they had at the snapshot.
trainSets = []
for snap in snaps:
    in_progress = train.apply(satisfy_, axis=1, snap_=snap)
    # Explicit copy so the column edits below act on an independent frame
    # instead of a filtered slice of `train`.
    subset = train[in_progress == True].copy()
    cut_points = subset.apply(getCutLonLat, axis=1, snap_=snap)
    subset.drop(['EndTime', 'TIMESTAMP', 'POLYLINE'], axis=1, inplace=True)
    # CutLon / CutLat are appended last; later steps rely on this column order.
    subset['CutLon'] = cut_points.apply(lambda x: x[0])
    subset['CutLat'] = cut_points.apply(lambda x: x[1])
    # Turn the step count into seconds (15 s per step): this is the target.
    subset['Len'] = subset.Len.apply(lambda x: x * 15)
    trainSets.append(subset)
print(trainSets[0].head())
# Split every snapshot training set by CALL_TYPE. The resulting list holds
# three frames per snapshot, in the order A, B, C, so the frame for snapshot
# `s` and call type offset `k` (A=0, B=1, C=2) is at index `s * 3 + k`.
# The list is sized from `trainSets` rather than a fixed 15, and the frames
# are built with non-inplace drops so no filtered slice is mutated.
newTrainSets = []
for snap_set in trainSets:
    call_type = snap_set['CALL_TYPE']

    # Type A keeps ORIGIN_CALL as a feature.
    type_a = snap_set[call_type == 'A'].drop(['CALL_TYPE', 'ORIGIN_STAND'], axis=1)

    # Type B keeps ORIGIN_STAND; rows without a stand are discarded.
    type_b = snap_set[call_type == 'B'].drop(['CALL_TYPE', 'ORIGIN_CALL'], axis=1)
    type_b = type_b.dropna(subset=['ORIGIN_STAND'])

    # Type C uses neither origin column.
    type_c = snap_set[call_type == 'C'].drop(
        ['CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND'], axis=1)

    newTrainSets.extend([type_a, type_b, type_c])
# Fit one random forest per (snapshot, call type) training frame. `models`
# is index-aligned with `newTrainSets`, and its length follows that list
# instead of a hard-coded 15.
#
# In every frame 'Len' (trip duration in seconds) is the target and all other
# columns are the features, in frame order. Selecting the target by name gives
# the same feature matrices as the former positional selection
# ([0, 2, 3, 4, 5, 6] with target 1 for types A/B, [1, 2, 3, 4, 5] with
# target 0 for type C). The prediction step must pass features in this order.
models = []
for frame in newTrainSets:
    forest = RandomForestRegressor(n_estimators=10, max_depth=7, random_state=0)
    X = frame.drop(columns='Len').values
    y = frame['Len'].values
    forest.fit(X, y)
    models.append(forest)
# Predict the travel time of every test trip and write the submission file.
#
# For each call type: (positions of the feature columns in `test`,
# offset of the matching model inside a snapshot's group of three models).
# Column 0 of `test` is the call type itself.
layout_by_call_type = {
    'A': ([1, 3, 4, 5, 6, 7], 0),
    'B': ([2, 3, 4, 5, 6, 7], 1),
}
# Anything that is neither 'A' nor 'B' is handled as type C.
fallback_layout = ([3, 4, 5, 6, 7], 2)

out_ = []
for row_no in range(test.shape[0]):
    columns, offset_ = layout_by_call_type.get(test.iloc[row_no, 0], fallback_layout)
    features = test.iloc[row_no, columns].values.reshape(1, -1)
    # Use the models trained for the snapshot closest to the trip's end time.
    model_no = getClosest(testEndTime[row_no]) * 3 + offset_
    estimate = models[model_no].predict(features)[0]
    # Fall back to the heuristic guess when the model's answer exceeds 5000
    # or when the trip has fewer than 27 recorded steps.
    if estimate > 5000 or heuristic.iloc[row_no, 1] < 27:
        estimate = heuristic.iloc[row_no, 0]
    out_.append(estimate)

sub['TRAVEL_TIME'] = pd.Series(out_)
sub.to_csv('Model5.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment