Skip to content

Instantly share code, notes, and snippets.

@shuozhang1985
Last active July 21, 2019 21:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shuozhang1985/c1acdbee962b412e4127be02bd05d6a1 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
#### yellow taxi: feature selection
# Load May-2014 yellow-taxi trips and keep only time + pickup coordinates.
os.chdir("/Users/shuozhang/Desktop/capstone/yellow taxi")
yellow_raw = pd.read_csv("yellow_tripdata_2014-05.csv", index_col=False)
yellow_raw.columns = [c.replace(" ", "") for c in yellow_raw.columns]  # remove space in column name
# .copy() so the assignments below modify an independent frame instead of a
# view of yellow_raw (avoids pandas SettingWithCopyWarning / silent no-ops).
yellow = yellow_raw[['pickup_datetime', 'pickup_latitude', 'pickup_longitude']].copy()
yellow.columns = ['Date', 'Lat', 'Lon']  # rename for future merge
yellow["Type"] = "yellow"  # EDA analysis label
# Keep the "YYYY-MM-DD HH" prefix and replace whitespace with "-" to form the
# hourly join key "YYYY-MM-DD-HH". r"\s+" is a raw string so the regex escape
# is not mangled by Python string-escape processing.
yellow["Date"] = yellow["Date"].str[0:13].replace(r"\s+", "-", regex=True)
#### green taxi: feature selection
# Load May-2014 green-taxi trips; same Date/Lat/Lon/Type shape as yellow.
os.chdir("/Users/shuozhang/Desktop/capstone/green taxi")
green_raw = pd.read_csv("green_tripdata_2014-05.csv", index_col=False)
green_raw.columns = [c.replace(" ", "") for c in green_raw.columns]
# The first columns of this file are mislabeled: the pickup timestamp is in
# the column read as 'VendorID', so copy it into 'lpep_pickup_datetime'.
green_raw['lpep_pickup_datetime'] = green_raw['VendorID']
green_raw.index = np.arange(green_raw.shape[0])  # rebuild the broken index
# .copy() so the assignments below do not operate on a view of green_raw.
green = green_raw[['lpep_pickup_datetime', 'Pickup_longitude', 'Pickup_latitude']].copy()
green.columns = ['Date', 'Lon', 'Lat']  # rename for future merge
green["Type"] = "green"  # EDA analysis label
# Truncate to "YYYY-MM-DD HH" and normalize the separator to "-" (raw string
# keeps the \s regex escape intact).
green["Date"] = green["Date"].str[0:13].replace(r"\s+", "-", regex=True)
#### uber: feature selection
# BUG FIX: os.chdir() accepts only a path — the stray index_col=False keyword
# raised a TypeError; that argument belongs on pd.read_csv below.
os.chdir("/Users/shuozhang/Desktop/capstone/uber")
uber = pd.read_csv("uber-raw-data-may14.csv", index_col=False)
uber.columns = ['Date', 'Lat', 'Lon', 'Base']
uber.drop(['Base'], axis=1, inplace=True)  # base-station id not needed downstream
uber["Type"] = "uber"  # EDA analysis label
# Normalize timestamps to the shared hourly join key "YYYY-MM-DD-HH".
uber["Date"] = pd.to_datetime(uber["Date"]).dt.strftime('%Y-%m-%d-%H')
#### weather: feature selection
os.chdir("/Users/shuozhang/Desktop/capstone")
weather = pd.read_csv("weather_2014_04.csv", index_col=False)
weather.columns = [c.replace(" ", "") for c in weather.columns]
# One row per observation id; keep the first reading of each duplicate.
weather = weather.drop_duplicates(subset='IDX', keep='first')
# Parse the "YYYYMMDDHHMM" stamp and reformat to the hourly key
# "YYYY-MM-DD-HH" in a single pass (the original re-ran pd.to_datetime on a
# column that was already datetime — redundant work, same result).
weather["YR--MODAHRMN"] = pd.to_datetime(
    weather["YR--MODAHRMN"], format="%Y%m%d%H%M").dt.strftime('%Y-%m-%d-%H')
# Drop auxiliary columns if present, leaving timestamp, temperature, rain.
weather.drop([col for col in ["SPD", "DEWP", "IDX", "SD"] if col in weather],
             axis=1, inplace=True)
weather.columns = ['Date', 'Temp', 'Rain']
weather = weather.fillna(0)  # treat missing readings as zero
#### data merge
# Stack the three trip sources vertically, then left-join the hourly weather
# readings on the shared "YYYY-MM-DD-HH" Date key.
trip_frames = [yellow, green, uber]
data = pd.concat(trip_frames, axis=0)
dataall = data.merge(weather, how='left', on=['Date'])
dataall.to_csv('may_data_all.csv', sep='\t')
#### apply the same process to april, june, july, august, september
#### dimension reduction: transfer coordinates to zipcode
os.chdir("/Users/shuozhang/Desktop/data")
raw_data = pd.read_csv('may_data_all.csv', sep='\t', index_col=False)
# BUG FIX: the original filter referenced an undefined `may` frame and relied
# on Python 2's map() returning a list; use a vectorized boolean mask on
# raw_data instead. The box (-75..-72, 40..41.5) comfortably covers NYC.
in_box = ((raw_data.Lon >= -75) & (raw_data.Lon <= -72) &
          (raw_data.Lat >= 40) & (raw_data.Lat <= 41.5))
data = raw_data.loc[in_box, :].copy()
# Round coordinates to 3 decimals to reduce the number of geocoding API
# calls, then build a "lat,lon" string key for the zipcode lookup table.
data['Lat'] = data['Lat'].round(3).astype(str)
data['Lon'] = data['Lon'].round(3).astype(str)
data["Coord"] = data[["Lat", "Lon"]].apply(lambda x: ",".join(x), axis=1)
# BUG FIX: to_csv() has no index_col parameter; index=False was intended.
data.to_csv('may_data.csv', sep='\t', index=False)
##### apply the same process to april, june, july, august, september
#### Google map API call
# Load the six pre-filtered monthly files and gather every coordinate key.
month_names = ['apr', 'may', 'june', 'july', 'aug', 'sep']
monthly = {m: pd.read_csv('%s_data.csv' % m, sep='\t', index_col=False)
           for m in month_names}
apr, may, june, july, aug, sep = (monthly[m] for m in month_names)
ls_coord = np.concatenate([monthly[m]["Coord"] for m in month_names], axis=0)
# Deduplicate so each distinct coordinate is geocoded only once.
df = pd.DataFrame(list(set(ls_coord)))
#### use google map api to read zipcode
# NOTE(review): despite the comment this uses geopy's Nominatim (OpenStreetMap),
# not the Google Maps API.
from geopy.geocoders import Nominatim
import re
import time
import csv
geolocator = Nominatim()
zipcode = []
zipcode_all = []
# BUG FIXES vs the original Python 2 script:
#   * df[n] selects COLUMN n (df has a single column, 0), so it failed for
#     n >= 1; df.iloc[n, 0] selects the n-th coordinate string as intended.
#   * guard location itself being None (Nominatim returns None on failure),
#     and compare with `is None`.
#   * str(filter(...)) in Python 3 stringifies the filter object; join the
#     kept characters instead.
#   * `print output` is Python 2 syntax.
#   * open the output file once instead of re-opening it every iteration.
with open(r"zipcode_output.csv", 'a') as f:
    writer = csv.writer(f, delimiter='\t')
    for n in range(0, df.shape[0]):
        coord = df.iloc[n, 0]
        location = geolocator.reverse(coord, timeout=50)
        if location is None or location.address is None:
            zipcode = "NaN"
        else:
            # Collect 5-digit runs bounded by non-digits (US ZIP codes).
            num = str(re.findall(r"\D(\d{5})\D", location.address))
            zipcode = "".join(ch for ch in num if ch.isdigit())
        output = str(coord) + "," + zipcode
        print(output)
        time.sleep(1)  # stay under the geocoder's rate limit
        writer.writerow([output])
#### data merge
# BUG FIX: the geocoding loop wrote "zipcode_output.csv", but this step read
# "zipoutput.csv" — read back the file that was actually written. ZIP codes
# are loaded as str (not the numpy 'S10' bytes dtype, which breaks str
# comparisons in Python 3) so leading zeros are preserved.
zipcode = pd.read_csv('zipcode_output.csv', sep='\t', index_col=False,
                      dtype={'zipcode': str})
may_raw = pd.read_csv('may_data.csv', sep='\t', index_col=False)
# Attach the looked-up zipcode to every trip via its rounded "lat,lon" key.
may = pd.merge(may_raw, zipcode, how='left', on=['Coord'])
#### apply the same process to april, june, july, august, september
nyc_zipcode = pd.read_csv('nyc_zipcode.csv', sep='\t', index_col=False,
                          dtype={'Zipcode': str})
may = may[may['zipcode'].isin(nyc_zipcode['Zipcode'])]  # filter nyc-only zipcode
# Aggregate to trip counts per (hour, weather, zipcode) cell.
may_aggre = may.groupby(['Date', 'Temp', 'Rain', 'zipcode']).size().reset_index(name='count')
# BUG FIX: to_csv() has no index_col parameter; index=False was intended.
may_aggre.to_csv('may_final_data.csv', sep='\t', index=False)
#### apply the same process to april, june, july, august, september, and finally combine together as data_final
#### add sub features
import numpy as np
import pandas as pd
import os
os.chdir("/Users/shuozhang/Desktop/data")
# ZIP codes stay str (not numpy 'S10' bytes) so leading zeros survive and
# comparisons work under Python 3.
data_final = pd.read_csv('data_all_final.csv', sep='\t', encoding='utf-8',
                         index_col=0, dtype={'zipcode': str})
# Expand the "YYYY-MM-DD-HH" key into calendar features for modeling.
date = pd.to_datetime(data_final['Date'], format='%Y-%m-%d-%H')
data_final['Month'] = date.dt.month
data_final['Day'] = date.dt.day
data_final['Hour'] = date.dt.hour
# .dt.weekday (Monday=0) replaces the redundant DatetimeIndex round-trip.
data_final['Weekday'] = date.dt.weekday
data_model = data_final[['Month', 'Weekday', 'Hour', 'Temp', 'Rain', 'zipcode', 'count']]
# BUG FIX: to_csv() has no index_col parameter; index=False was intended.
data_model.to_csv('nycmodeldata.csv', sep='\t', encoding='utf-8', index=False)
@KhondokerTanvirHossain
Copy link

Why are you filtering the zipcode at line 120?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment