Skip to content

Instantly share code, notes, and snippets.

@jkotra
Created August 29, 2018 17:19
Show Gist options
  • Save jkotra/f418d01f584606e67ad16768aab6d710 to your computer and use it in GitHub Desktop.
Save jkotra/f418d01f584606e67ad16768aab6d710 to your computer and use it in GitHub Desktop.
taxi-preprocessing
# Keep only rows that pass every sanity check at once:
#   * fare_amount strictly positive
#   * pickup and dropoff longitude below -72
#   * pickup and dropoff latitude strictly between 40 and 44
#   * passenger_count strictly between 0 and 10
train = train[
    (train['fare_amount'] > 0)
    & (train['pickup_longitude'] < -72)
    & (train['pickup_latitude'] > 40)
    & (train['pickup_latitude'] < 44)
    & (train['dropoff_longitude'] < -72)
    & (train['dropoff_latitude'] > 40)
    & (train['dropoff_latitude'] < 44)
    & (train['passenger_count'] > 0)
    & (train['passenger_count'] < 10)
]
#---#
from math import sin, cos, sqrt, atan2, radians
def quick_dist_calc(df):
    """Add a ``distance`` column with the pickup-to-dropoff haversine distance.

    Reads the ``pickup_latitude``, ``pickup_longitude``, ``dropoff_latitude``
    and ``dropoff_longitude`` columns (degrees; they are converted to radians
    here) and writes the great-circle distance, scaled by an Earth radius of
    6373.0 (kilometres), into ``df['distance']``.

    The frame is modified in place and nothing is returned.

    The computation is vectorised rather than done row by row with
    ``iterrows()``/``df.at``: that is far faster on large frames, gives every
    row its own value even when index labels repeat, and creates the column
    for an empty frame as well.
    """
    # Local import so the block does not depend on a file-level numpy import.
    import numpy as np

    R = 6373.0

    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(df[column], dtype=float))
        for column in (
            'pickup_latitude',
            'pickup_longitude',
            'dropoff_latitude',
            'dropoff_longitude',
        )
    )

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1]; clamp it so that
    # sqrt(1 - a) stays defined. NaN inputs remain NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # Positional assignment of a plain array: no index alignment involved.
    df['distance'] = R * c
def quick_dist_calc_loc(df, c1, c2, cname):
    """Add haversine distances from each trip's endpoints to a fixed point.

    Parameters:
        df: frame with ``pickup_latitude``, ``pickup_longitude``,
            ``dropoff_latitude`` and ``dropoff_longitude`` columns in degrees.
        c1: latitude of the reference point, in degrees.
        c2: longitude of the reference point, in degrees.
        cname: prefix for the two columns that are written.

    Writes ``<cname>_pickup_dist`` and ``<cname>_dropoff_dist`` (Earth radius
    6373.0, i.e. kilometres) into ``df`` in place and returns nothing.

    The work is vectorised instead of looping with ``iterrows()``/``df.at``,
    so the reference point is converted to radians once, rows with repeated
    index labels each get their own value, and an empty frame still gains
    both columns.
    """
    # Local import so the block does not depend on a file-level numpy import.
    import numpy as np

    R = 6373.0

    ref_lat = np.radians(float(c1))
    ref_lon = np.radians(float(c2))

    pickup_lat = np.radians(np.asarray(df['pickup_latitude'], dtype=float))
    pickup_lon = np.radians(np.asarray(df['pickup_longitude'], dtype=float))
    dropoff_lat = np.radians(np.asarray(df['dropoff_latitude'], dtype=float))
    dropoff_lon = np.radians(np.asarray(df['dropoff_longitude'], dtype=float))

    df[cname + '_pickup_dist'] = _haversine_to_point(
        pickup_lat, pickup_lon, ref_lat, ref_lon, R
    )
    df[cname + '_dropoff_dist'] = _haversine_to_point(
        dropoff_lat, dropoff_lon, ref_lat, ref_lon, R
    )


def _haversine_to_point(lat, lon, ref_lat, ref_lon, radius):
    """Return haversine distances from (lat, lon) arrays to one point (all radians)."""
    import numpy as np

    dlat = ref_lat - lat
    dlon = ref_lon - lon
    a = np.sin(dlat / 2) ** 2 + np.cos(ref_lat) * np.cos(lat) * np.sin(dlon / 2) ** 2
    # Guard against rounding pushing `a` marginally outside [0, 1].
    a = np.clip(a, 0.0, 1.0)
    return radius * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
# Trip length first, for both frames.
for _frame in (train, test):
    quick_dist_calc(_frame)

# Reference points, stored as (longitude, latitude).
jfk_airport = (-73.785193, 40.645972)
laguardia_airport = (-73.872925, 40.773335)
newark_airport = (-74.184156, 40.692764)
manhattan = (-73.983132, 40.759006)

# Column-name prefix paired with its reference point, in the order the
# columns are created.
_landmarks = (
    ('jfk_airport', jfk_airport),
    ('laguardia_airport', laguardia_airport),
    ('newark_airport', newark_airport),
    ('manhattan', manhattan),
)

# quick_dist_calc_loc takes latitude first, then longitude.
for _frame in (train, test):
    for _label, (_lon, _lat) in _landmarks:
        quick_dist_calc_loc(_frame, _lat, _lon, _label)
from datetime import datetime
def timestamp(df):
    """Derive time features from the ``key`` column, in place.

    The first 19 characters of each ``key`` value are parsed as
    ``%Y-%m-%d %H:%M:%S``. Four columns are written:

    * ``timestamp`` - the parsed time rendered without zero padding,
      e.g. ``2015-1-26 9:5:3``.
    * ``year`` - the four-character year prefix of the key, as a string.
    * ``ph`` - 1 when the hour is 13 through 21 inclusive, otherwise 0.
    * ``wd`` - 1 for Monday through Friday, otherwise 0.

    Returns nothing.

    ``datetime.weekday()`` numbers Monday as 0, so the weekday flag is
    computed as ``weekday() < 5``; testing membership in ``[1, 2, 3, 4, 5]``
    would flag Tuesday-Saturday instead. Columns are built in one pass and
    assigned whole, which avoids per-row ``df.at`` writes (slow, and wrong
    for repeated index labels) and stores ``ph``/``wd`` as integers.
    """
    stamps = []
    years = []
    peak_flags = []
    weekday_flags = []

    for key in df['key']:
        ts = key[:19]
        dto = datetime.strptime(ts, '%Y-%m-%d %H:%M:%S')
        stamps.append(
            f"{dto.year}-{dto.month}-{dto.day} {dto.hour}:{dto.minute}:{dto.second}"
        )
        years.append(ts[:4])
        peak_flags.append(1 if 13 <= dto.hour <= 21 else 0)
        weekday_flags.append(1 if dto.weekday() < 5 else 0)

    # Lists are assigned positionally, so the frame's index is irrelevant.
    df['timestamp'] = stamps
    df['year'] = years
    df['ph'] = peak_flags
    df['wd'] = weekday_flags
# Derive the time features for both frames (test first, then train).
for _frame in (test, train):
    timestamp(_frame)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment