# KanishkVashisht/xgb_taxi.py

## xgb_taxi.py
"""
Using extreme gradient boosting to predict the price of a taxi from point a to point b using google public data
"""

#importing libs
from numpy import loadtxt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from matplotlib import style
from google.cloud import bigquery
import pandas_gbq as pgbq
import pickle

#initialising variables
limit = True            # when True, cap the number of rows pulled from BigQuery
limit_amt = 2000000     # row cap used when `limit` is True
label = 'fare_amount'   # column used as the regression target (y)
features = ['trip_distance','diff']  # columns used as model inputs (X)
test_size=0.33          # fraction of rows held out for evaluation
seed=7                  # random_state for a reproducible train/test split
projectid = "nyc-yellowcab-data"
table = "abc.cleanedupdata_final"

#load data
# Build the base query once and append the LIMIT clause only when requested.
# The previous unlimited branch concatenated "SELECT * FROM" and the table
# name without a separating space, which produced invalid SQL.
query = "SELECT * FROM " + table
if limit:
    query += " LIMIT " + str(limit_amt)
# NOTE(review): newer pandas-gbq releases deprecate `private_key` in favour of
# `credentials` -- confirm the installed version before upgrading this call.
df = pgbq.read_gbq(query, project_id=projectid, private_key = "./googlekey.json")

#set x and y
X = df[features]
y = df[label]
print("===\n\n\nX data")
print(X.head())
print("\n\n\n===")
print("===\n\n\nY data")
print(y.head())
print("\n\n\n===")

#split dataframe
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

#train xgboost
# NOTE(review): recent XGBoost versions deprecate 'reg:linear' in favour of
# 'reg:squarederror' -- switch once the installed version is confirmed.
model = XGBRegressor(objective='reg:linear')
model.fit(X_train, y_train)
# Persist the fitted model; the context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open('taxiModel.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)

#test the model
##make predictions
# For a regressor, `score` follows the scikit-learn convention and returns the
# coefficient of determination (R^2) on the held-out data, not a
# classification accuracy. The variable name is kept for compatibility.
accuracy = model.score(X_test,y_test)
print(accuracy)
	"""
	Using extreme gradient boosting to predict the price of a taxi from point a to point b using google public data
	"""

	#importing libs
	from numpy import loadtxt
	from xgboost import XGBRegressor
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score
	import pandas as pd
	import datetime as dt
	import matplotlib.pyplot as plt
	from matplotlib import style
	from google.cloud import bigquery
	import pandas_gbq as pgbq
	import pickle

	#initialising variables
	limit = True
	limit_amt = 2000000
	label = 'fare_amount'
	features = ['trip_distance','diff']
	test_size=0.33
	seed=7
	projectid = "nyc-yellowcab-data"
	table = "abc.cleanedupdata_final"

	#load data
	if(limit):
	query = "SELECT * FROM "+table+" LIMIT "+str(limit_amt)
	else:
	query = "SELECT * FROM"+table
	df = pgbq.read_gbq(query, project_id=projectid, private_key = "./googlekey.json")

	#set x and y
	X = df[features]
	y = df[label]
	print("===\n\n\nX data")
	print(X.head())
	print("\n\n\n===")
	print("===\n\n\nY data")
	print(y.head())
	print("\n\n\n===")

	#split dataframe
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

	#train xgboost
	model = XGBRegressor(objective='reg:linear')
	model.fit(X_train, y_train)
	pickle.dump(model,open('taxiModel.pickle','wb'))

	#test the model
	##make predictions
	accuracy = model.score(X_test,y_test)
	print(accuracy)