Skip to content

Instantly share code, notes, and snippets.

@KanishkVashisht
Last active August 18, 2018 13:12
Show Gist options
  • Save KanishkVashisht/74f1b9ad67e13f278d40049ffab50590 to your computer and use it in GitHub Desktop.
Save KanishkVashisht/74f1b9ad67e13f278d40049ffab50590 to your computer and use it in GitHub Desktop.
"""
Use extreme gradient boosting (XGBoost) to predict the fare of a taxi trip from point A to point B, using Google public data.
"""
#importing libs
from numpy import loadtxt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from matplotlib import style
from google.cloud import bigquery
import pandas_gbq as pgbq
import pickle
# --- Run configuration ---
# When `limit` is true the query is capped at `limit_amt` rows; otherwise the
# whole table is selected.
limit = True
limit_amt = 2000000
# Column used as the regression target, and the columns used as model inputs.
label = 'fare_amount'
features = ['trip_distance','diff']
# Fraction of rows held out for evaluation, and the seed for the random split.
test_size=0.33
seed=7
# BigQuery project and table the data is read from.
projectid = "nyc-yellowcab-data"
table = "abc.cleanedupdata_final"

# --- Build the query ---
# Start from the unconditional SELECT and append the LIMIT clause only when
# requested, so both variants share one correctly spaced "FROM <table>" prefix.
query = "SELECT * FROM " + table
if limit:
    query += " LIMIT " + str(limit_amt)
# Run the query against BigQuery and load the result into a DataFrame.
# NOTE(review): newer pandas-gbq releases appear to deprecate `private_key` in
# favour of `credentials` -- confirm against the installed version.
df = pgbq.read_gbq(
    query,
    project_id=projectid,
    private_key="./googlekey.json",
)

# Feature matrix (X) and regression target (y), selected by column name.
X, y = df[features], df[label]

# Preview the first rows of each between "===" banners.
print("===\n\n\nX data", X.head(), "\n\n\n===", sep="\n")
print("===\n\n\nY data", y.head(), "\n\n\n===", sep="\n")

# Hold out `test_size` of the rows for evaluation; `seed` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)
#train xgboost
# NOTE(review): newer XGBoost releases name this objective 'reg:squarederror'
# and may warn on 'reg:linear' -- confirm against the installed version before
# changing it.
model = XGBRegressor(objective='reg:linear')
model.fit(X_train, y_train)

# Persist the fitted model. The context manager closes the file handle as soon
# as the dump finishes (or fails), so the pickle is flushed and not leaked.
with open('taxiModel.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)

#test the model
# Score the model on the held-out split. Per the scikit-learn regressor API,
# `score` reports the coefficient of determination (R^2), not a classification
# accuracy; the variable name is kept for compatibility.
accuracy = model.score(X_test,y_test)
print(accuracy)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment