Last active
May 17, 2018 09:11
Star
You must be signed in to star a gist
The task at hand is to predict whether a given blight ticket will be paid on time. Blight violations are issued by the city of Detroit to individuals who allow their properties to remain in a deteriorated condition. The data were obtained from the Detroit Open Data Portal.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
"""Exploratory setup: load the Detroit blight-ticket training data and keep
only rows where the target (`compliance`) is known."""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from datetime import datetime

df = pd.read_csv("train.csv")

# Number of null values per column of df:
#   df.isnull().sum()
# or equivalently (df.count() counts the non-NaN values of each column):
#   len(df) - df.count()
list(df)  # column names

# Rows with NaN compliance are tickets whose outcome is unknown; drop them.
train_data = df[pd.notnull(df.compliance)]
for n, name in enumerate(train_data.columns):
    print(n, name)

address = pd.read_csv("addresses.csv")
latlon = pd.read_csv("latlons.csv")
lat = latlon.set_index("address")

# Drop rows with no hearing date (the original applied this filter twice).
train_data = train_data[~train_data['hearing_date'].isnull()]
#function to return gap in days between the hearing date and the ticket issue date
#the mean of it is found to be 73 so in case gap cannot be calculated, we use 73
def time_gap(hearingdate_str, ticket_issueddate_str, default=73):
    """Return the whole-day gap between hearing date and ticket issue date.

    Parameters
    ----------
    hearingdate_str : str or NaN/None
        Hearing timestamp in "%Y-%m-%d %H:%M:%S" format; may be missing.
    ticket_issueddate_str : str
        Ticket issue timestamp in the same format.
    default : int, optional
        Fallback when the hearing date is missing (dataset mean is ~73 days).

    Returns
    -------
    int
        ``(hearing - issued).days``, or *default* when the hearing date
        cannot be parsed.
    """
    # pandas stores missing strings as float NaN, and `not nan` is False,
    # so a truthiness test alone would let NaN reach strptime and crash.
    if not isinstance(hearingdate_str, str) or not hearingdate_str:
        return default
    hearing_date = datetime.strptime(hearingdate_str, "%Y-%m-%d %H:%M:%S")
    ticket_issued_date = datetime.strptime(ticket_issueddate_str, "%Y-%m-%d %H:%M:%S")
    return (hearing_date - ticket_issued_date).days
#example demo of how time_gap() works
gap = (datetime.strptime("2015-3-22 06:56:38", "%Y-%m-%d %H:%M:%S")
       - datetime.strptime("2013-4-6 13:00:04", "%Y-%m-%d %H:%M:%S"))
gap.days

# Derived feature: days between ticket issue and hearing, per row.
train_data['time_gap'] = train_data.apply(
    lambda row: time_gap(row['hearing_date'], row['ticket_issued_date']), axis=1)
# 72.6474103211 before the `return 73` fallback was added to time_gap()
print(train_data.time_gap.mean())

test_data = pd.read_csv("test.csv")
test_data.shape
len(train_data[~train_data['hearing_date'].isnull()])
##function to be run for model-building and predictions
def blight_model():
    """Train a RandomForest on the blight-ticket data and predict, for each
    ticket in test.csv, the probability that it will be paid on time.

    Reads train.csv, test.csv, addresses.csv and latlons.csv from the
    working directory, prints the held-out validation AUC, and returns the
    test-set compliance probabilities as a Series indexed by ticket_id.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from datetime import datetime

    def time_gap(hearingdate_str, ticket_issueddate_str):
        # Days between hearing and ticket issue; the dataset mean (~73) is
        # used when the hearing date is missing. NOTE: pandas stores missing
        # strings as float NaN, so the isinstance check is required — the
        # original compared `type(x) == 'str'` (a type against a string),
        # which is always False and left hearing_date unbound.
        if not isinstance(hearingdate_str, str) or not hearingdate_str:
            return 73
        hearing_date = datetime.strptime(hearingdate_str, "%Y-%m-%d %H:%M:%S")
        ticket_issued_date = datetime.strptime(ticket_issueddate_str, "%Y-%m-%d %H:%M:%S")
        return (hearing_date - ticket_issued_date).days

    train_data = pd.read_csv("train.csv", encoding="ISO-8859-1")
    test_data = pd.read_csv("test.csv")

    # Keep only rows with a known binary target.
    train_data = train_data[(train_data['compliance'] == 0) | (train_data['compliance'] == 1)]

    # Attach latitude/longitude to each ticket via the address lookup tables.
    address = pd.read_csv("addresses.csv")
    latlon = pd.read_csv("latlons.csv")
    address = address.set_index('address').join(latlon.set_index('address'), how='left')
    train_data = train_data.set_index('ticket_id').join(address.set_index('ticket_id'))
    test_data = test_data.set_index('ticket_id').join(address.set_index('ticket_id'))

    train_data = train_data[~train_data['hearing_date'].isnull()]
    train_data['time_gap'] = train_data.apply(
        lambda row: time_gap(row['hearing_date'], row['ticket_issued_date']), axis=1)
    test_data['time_gap'] = test_data.apply(
        lambda row: time_gap(row['hearing_date'], row['ticket_issued_date']), axis=1)

    #categorical features that need to be replaced by dummy variables
    features_categorical = ['agency_name']

    #forward-filling NAs in lat, lon, state columns of both sets
    for frame in (train_data, test_data):
        frame['lat'].fillna(method='pad', inplace=True)
        frame['lon'].fillna(method='pad', inplace=True)
        frame['state'].fillna(method='pad', inplace=True)

    #replacing categorical variables with dummy variables
    train_data = pd.get_dummies(train_data, columns=features_categorical)
    test_data = pd.get_dummies(test_data, columns=features_categorical)

    #features to be removed from train set as they are missing in test set
    #(they are only known after the ticket outcome, i.e. target leakage)
    features_to_remove_train = ['balance_due', 'collection_status',
                                'compliance_detail',
                                'payment_amount',
                                'payment_date',
                                'payment_status']
    #features to be removed from both train, test sets
    features_to_remove_both = ['state', 'violator_name', 'disposition',
                               'zip_code', 'country', 'city', 'inspector_name',
                               'violation_street_number', 'violation_street_name',
                               'violation_zip_code', 'violation_description', 'time_gap',
                               'mailing_address_str_number', 'mailing_address_str_name',
                               'non_us_str_code', 'clean_up_cost',
                               'ticket_issued_date', 'hearing_date', 'grafitti_status',
                               'violation_code']
    train_data.drop(features_to_remove_train, axis=1, inplace=True)
    train_data.drop(features_to_remove_both, axis=1, inplace=True)
    test_data.drop(features_to_remove_both, axis=1, inplace=True)

    # Keep only features that also exist in the test set (dummy columns may
    # differ between the two); order preserved for reproducibility.
    features = train_data.columns.drop('compliance')
    print("INITIAL :", list(features))
    features = [f for f in features if f in test_data]
    print("FINAL :", features)
    print(len(features))

    X = train_data[features]
    y = train_data['compliance']
    print(X.isnull().sum())

    clf = RandomForestClassifier(n_estimators=100, max_depth=8, n_jobs=-1, random_state=0)
    # Fit on the 80% training split only — the original fit on ALL rows and
    # then scored the validation split it had trained on, inflating the AUC.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,
                                                          random_state=0)
    clf.fit(X_train, y_train)
    print(roc_auc_score(y_valid, clf.predict_proba(X_valid)[:, 1]))

    # Refit on all labelled data before predicting on the test set.
    clf.fit(X, y)
    y_pred = clf.predict_proba(test_data[features])[:, 1]
    test_data['compliance'] = y_pred
    return test_data['compliance']
if __name__ == "__main__":
    # Run the pipeline only when executed as a script, not on import.
    blight_model()
    ##roc_auc_score :0.7946978282463928
    #AUC (Area Under Curve) is a ranking metric for binary classifications
    #ROC- receiver operating characteristic
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment