Skip to content

Instantly share code, notes, and snippets.

@shuozhang1985
Last active July 21, 2019 21:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shuozhang1985/c1acdbee962b412e4127be02bd05d6a1 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
#### yellow taxi: feature selection
# Load May-2014 yellow-taxi trips and keep only time + pickup coordinates.
os.chdir("/Users/shuozhang/Desktop/capstone/yellow taxi")
yellow_raw = pd.read_csv("yellow_tripdata_2014-05.csv", index_col=False)
yellow_raw.columns = [c.replace(" ", "") for c in yellow_raw.columns]  # remove space in column name
# .copy() so the assignments below modify an independent frame instead of a
# view of yellow_raw (avoids pandas SettingWithCopyWarning / silent no-ops).
yellow = yellow_raw[['pickup_datetime', 'pickup_latitude', 'pickup_longitude']].copy()
yellow.columns = ['Date', 'Lat', 'Lon']  # rename for future merge
yellow["Type"] = "yellow"  # EDA analysis label
# Keep the "YYYY-MM-DD HH" prefix and replace whitespace with "-" to form the
# hourly join key "YYYY-MM-DD-HH". r"\s+" is a raw string so the regex escape
# is not mangled by Python string-escape processing.
yellow["Date"] = yellow["Date"].str[0:13].replace(r"\s+", "-", regex=True)
#### green taxi: feature selection
# Load May-2014 green-taxi trips; same Date/Lat/Lon/Type shape as yellow.
os.chdir("/Users/shuozhang/Desktop/capstone/green taxi")
green_raw = pd.read_csv("green_tripdata_2014-05.csv", index_col=False)
green_raw.columns = [c.replace(" ", "") for c in green_raw.columns]
# The first columns of this file are mislabeled: the pickup timestamp is in
# the column read as 'VendorID', so copy it into 'lpep_pickup_datetime'.
green_raw['lpep_pickup_datetime'] = green_raw['VendorID']
green_raw.index = np.arange(green_raw.shape[0])  # rebuild the broken index
# .copy() so the assignments below do not operate on a view of green_raw.
green = green_raw[['lpep_pickup_datetime', 'Pickup_longitude', 'Pickup_latitude']].copy()
green.columns = ['Date', 'Lon', 'Lat']  # rename for future merge
green["Type"] = "green"  # EDA analysis label
# Truncate to "YYYY-MM-DD HH" and normalize the separator to "-" (raw string
# keeps the \s regex escape intact).
green["Date"] = green["Date"].str[0:13].replace(r"\s+", "-", regex=True)
#### uber: feature selection
# BUG FIX: os.chdir() accepts only a path — the stray index_col=False keyword
# raised a TypeError; that argument belongs on pd.read_csv below.
os.chdir("/Users/shuozhang/Desktop/capstone/uber")
uber = pd.read_csv("uber-raw-data-may14.csv", index_col=False)
uber.columns = ['Date', 'Lat', 'Lon', 'Base']
uber.drop(['Base'], axis=1, inplace=True)  # base-station id not needed downstream
uber["Type"] = "uber"  # EDA analysis label
# Normalize timestamps to the shared hourly join key "YYYY-MM-DD-HH".
uber["Date"] = pd.to_datetime(uber["Date"]).dt.strftime('%Y-%m-%d-%H')
#### weather: feature selection
os.chdir("/Users/shuozhang/Desktop/capstone")
weather = pd.read_csv("weather_2014_04.csv", index_col=False)
weather.columns = [c.replace(" ", "") for c in weather.columns]
# One row per observation id; keep the first reading of each duplicate.
weather = weather.drop_duplicates(subset='IDX', keep='first')
# Parse the "YYYYMMDDHHMM" stamp and reformat to the hourly key
# "YYYY-MM-DD-HH" in a single pass (the original re-ran pd.to_datetime on a
# column that was already datetime — redundant work, same result).
weather["YR--MODAHRMN"] = pd.to_datetime(
    weather["YR--MODAHRMN"], format="%Y%m%d%H%M").dt.strftime('%Y-%m-%d-%H')
# Drop auxiliary columns if present, leaving timestamp, temperature, rain.
weather.drop([col for col in ["SPD", "DEWP", "IDX", "SD"] if col in weather],
             axis=1, inplace=True)
weather.columns = ['Date', 'Temp', 'Rain']
weather = weather.fillna(0)  # treat missing readings as zero
#### data merge
# Stack the three trip sources vertically, then left-join the hourly weather
# readings on the shared "YYYY-MM-DD-HH" Date key.
trip_frames = [yellow, green, uber]
data = pd.concat(trip_frames, axis=0)
dataall = data.merge(weather, how='left', on=['Date'])
dataall.to_csv('may_data_all.csv', sep='\t')
#### apply the same process to april, june, july, august, september
#### dimension reduction: transfer coordinates to zipcode
os.chdir("/Users/shuozhang/Desktop/data")
raw_data = pd.read_csv('may_data_all.csv', sep='\t', index_col=False)
# BUG FIX: the original filter referenced an undefined `may` frame and relied
# on Python 2's map() returning a list; use a vectorized boolean mask on
# raw_data instead. The box (-75..-72, 40..41.5) comfortably covers NYC.
in_box = ((raw_data.Lon >= -75) & (raw_data.Lon <= -72) &
          (raw_data.Lat >= 40) & (raw_data.Lat <= 41.5))
data = raw_data.loc[in_box, :].copy()
# Round coordinates to 3 decimals to reduce the number of geocoding API
# calls, then build a "lat,lon" string key for the zipcode lookup table.
data['Lat'] = data['Lat'].round(3).astype(str)
data['Lon'] = data['Lon'].round(3).astype(str)
data["Coord"] = data[["Lat", "Lon"]].apply(lambda x: ",".join(x), axis=1)
# BUG FIX: to_csv() has no index_col parameter; index=False was intended.
data.to_csv('may_data.csv', sep='\t', index=False)
##### apply the same process to april, june, july, august, september
#### Google map API call
# Load the six pre-filtered monthly files and gather every coordinate key.
month_names = ['apr', 'may', 'june', 'july', 'aug', 'sep']
monthly = {m: pd.read_csv('%s_data.csv' % m, sep='\t', index_col=False)
           for m in month_names}
apr, may, june, july, aug, sep = (monthly[m] for m in month_names)
ls_coord = np.concatenate([monthly[m]["Coord"] for m in month_names], axis=0)
# Deduplicate so each distinct coordinate is geocoded only once.
df = pd.DataFrame(list(set(ls_coord)))
#### use google map api to read zipcode
# NOTE(review): despite the comment this uses geopy's Nominatim (OpenStreetMap),
# not the Google Maps API.
from geopy.geocoders import Nominatim
import re
import time
import csv
geolocator = Nominatim()
zipcode = []
zipcode_all = []
# BUG FIXES vs the original Python 2 script:
#   * df[n] selects COLUMN n (df has a single column, 0), so it failed for
#     n >= 1; df.iloc[n, 0] selects the n-th coordinate string as intended.
#   * guard location itself being None (Nominatim returns None on failure),
#     and compare with `is None`.
#   * str(filter(...)) in Python 3 stringifies the filter object; join the
#     kept characters instead.
#   * `print output` is Python 2 syntax.
#   * open the output file once instead of re-opening it every iteration.
with open(r"zipcode_output.csv", 'a') as f:
    writer = csv.writer(f, delimiter='\t')
    for n in range(0, df.shape[0]):
        coord = df.iloc[n, 0]
        location = geolocator.reverse(coord, timeout=50)
        if location is None or location.address is None:
            zipcode = "NaN"
        else:
            # Collect 5-digit runs bounded by non-digits (US ZIP codes).
            num = str(re.findall(r"\D(\d{5})\D", location.address))
            zipcode = "".join(ch for ch in num if ch.isdigit())
        output = str(coord) + "," + zipcode
        print(output)
        time.sleep(1)  # stay under the geocoder's rate limit
        writer.writerow([output])
#### data merge
# BUG FIX: the geocoding loop wrote "zipcode_output.csv", but this step read
# "zipoutput.csv" — read back the file that was actually written. ZIP codes
# are loaded as str (not the numpy 'S10' bytes dtype, which breaks str
# comparisons in Python 3) so leading zeros are preserved.
zipcode = pd.read_csv('zipcode_output.csv', sep='\t', index_col=False,
                      dtype={'zipcode': str})
may_raw = pd.read_csv('may_data.csv', sep='\t', index_col=False)
# Attach the looked-up zipcode to every trip via its rounded "lat,lon" key.
may = pd.merge(may_raw, zipcode, how='left', on=['Coord'])
#### apply the same process to april, june, july, august, september
nyc_zipcode = pd.read_csv('nyc_zipcode.csv', sep='\t', index_col=False,
                          dtype={'Zipcode': str})
may = may[may['zipcode'].isin(nyc_zipcode['Zipcode'])]  # filter nyc-only zipcode
# Aggregate to trip counts per (hour, weather, zipcode) cell.
may_aggre = may.groupby(['Date', 'Temp', 'Rain', 'zipcode']).size().reset_index(name='count')
# BUG FIX: to_csv() has no index_col parameter; index=False was intended.
may_aggre.to_csv('may_final_data.csv', sep='\t', index=False)
#### apply the same process to april, june, july, august, september, and finally combine together as data_final
#### add sub features
import numpy as np
import pandas as pd
import os
os.chdir("/Users/shuozhang/Desktop/data")
# ZIP codes stay str (not numpy 'S10' bytes) so leading zeros survive and
# comparisons work under Python 3.
data_final = pd.read_csv('data_all_final.csv', sep='\t', encoding='utf-8',
                         index_col=0, dtype={'zipcode': str})
# Expand the "YYYY-MM-DD-HH" key into calendar features for modeling.
date = pd.to_datetime(data_final['Date'], format='%Y-%m-%d-%H')
data_final['Month'] = date.dt.month
data_final['Day'] = date.dt.day
data_final['Hour'] = date.dt.hour
# .dt.weekday (Monday=0) replaces the redundant DatetimeIndex round-trip.
data_final['Weekday'] = date.dt.weekday
data_model = data_final[['Month', 'Weekday', 'Hour', 'Temp', 'Rain', 'zipcode', 'count']]
# BUG FIX: to_csv() has no index_col parameter; index=False was intended.
data_model.to_csv('nycmodeldata.csv', sep='\t', encoding='utf-8', index=False)
@KhondokerTanvirHossain
Copy link

Why are you filtering the zipcode at line 120?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment