@parvathysarat
Last active May 17, 2018 09:11
The task at hand is to predict whether a given blight ticket will be paid on time. Blight violations are issued by the city of Detroit to individuals who allow their properties to remain in a deteriorated condition. The data was obtained from the Detroit Open Data Portal.
# coding: utf-8
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from datetime import datetime
df = pd.read_csv("train.csv", encoding="ISO-8859-1")
# Finding the number of null values in each column of the df DataFrame
# print(df.isnull().sum())
# OR
# print(len(df) - df.count())
# df.count() gives the number of non-NaN values in each column of the df DataFrame
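# Quick sanity check (an illustration added here, not part of the original notebook):
# the two approaches above give identical per-column NaN counts.
assert (df.isnull().sum() == (len(df) - df.count())).all()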
print(list(df))
train_data = df[pd.notnull(df.compliance)]
for n, name in enumerate(train_data.columns):
    print(n, name)
address=pd.read_csv("addresses.csv")
latlon=pd.read_csv("latlons.csv")
lat=latlon.set_index("address")
train_data = train_data[~train_data['hearing_date'].isnull()]
# function to return the gap in days between the hearing date and the ticket issue date
# the mean gap is about 73 days, so when the gap cannot be calculated we use 73
def time_gap(hearingdate_str, ticket_issueddate_str):
    if not hearingdate_str:
        return 73
    hearing_date = datetime.strptime(hearingdate_str, "%Y-%m-%d %H:%M:%S")
    ticket_issued_date = datetime.strptime(ticket_issueddate_str, "%Y-%m-%d %H:%M:%S")
    gap = hearing_date - ticket_issued_date
    return gap.days
#example demo of how time_gap() works
gap=datetime.strptime("2015-3-22 06:56:38","%Y-%m-%d %H:%M:%S")-datetime.strptime("2013-4-6 13:00:04","%Y-%m-%d %H:%M:%S")
print(gap.days)
train_data['time_gap']=train_data.apply(lambda row: time_gap(row['hearing_date'],row['ticket_issued_date']),axis=1)
print(train_data.time_gap.mean())  # 72.6474103211 before adding the "return 73" fallback in time_gap()
test_data=pd.read_csv("test.csv")
print(test_data.shape)
print(len(train_data[~train_data['hearing_date'].isnull()]))
##function to be run for model-building and predictions
def blight_model():
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler
    from datetime import datetime

    def time_gap(hearingdate_str, ticket_issueddate_str):
        # fall back to 73 (the mean gap in days) when the hearing date is missing or not a string
        if not isinstance(hearingdate_str, str):
            return 73
        hearing_date = datetime.strptime(hearingdate_str, "%Y-%m-%d %H:%M:%S")
        ticket_issued_date = datetime.strptime(ticket_issueddate_str, "%Y-%m-%d %H:%M:%S")
        gap = hearing_date - ticket_issued_date
        return gap.days
    train_data = pd.read_csv("train.csv", encoding="ISO-8859-1")
    test_data = pd.read_csv("test.csv")
    train_data = train_data[(train_data['compliance'] == 0) | (train_data['compliance'] == 1)]
    address = pd.read_csv("addresses.csv")
    latlon = pd.read_csv("latlons.csv")
    address = address.set_index('address').join(latlon.set_index('address'), how='left')
    train_data = train_data.set_index('ticket_id').join(address.set_index('ticket_id'))
    test_data = test_data.set_index('ticket_id').join(address.set_index('ticket_id'))
    train_data = train_data[~train_data['hearing_date'].isnull()]
    train_data['time_gap'] = train_data.apply(lambda row: time_gap(row['hearing_date'], row['ticket_issued_date']), axis=1)
    test_data['time_gap'] = test_data.apply(lambda row: time_gap(row['hearing_date'], row['ticket_issued_date']), axis=1)
    # categorical features that need to be replaced by dummy variables
    features_categorical = ['agency_name']
    # filling NAs in lat, lon, state columns in the train set
    train_data.lat.fillna(method='pad', inplace=True)
    train_data.lon.fillna(method='pad', inplace=True)
    train_data.state.fillna(method='pad', inplace=True)
    # filling NAs in lat, lon, state columns in the test set
    test_data.lat.fillna(method='pad', inplace=True)
    test_data.lon.fillna(method='pad', inplace=True)
    test_data.state.fillna(method='pad', inplace=True)
    # replacing categorical variables with dummy variables
    train_data = pd.get_dummies(train_data, columns=features_categorical)
    test_data = pd.get_dummies(test_data, columns=features_categorical)
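    # Note (added illustration, not in the original gist): calling get_dummies on the train
    # and test sets separately can produce different dummy columns; the feature-intersection
    # loop further below guards against that. An alternative sketch would be to align the
    # test columns to the train columns explicitly, e.g.:
    # test_data = test_data.reindex(columns=train_data.columns, fill_value=0)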
    # features to be removed from the train set as they are missing in the test set
    features_to_remove_train = ['balance_due', 'collection_status',
                                'compliance_detail',
                                'payment_amount',
                                'payment_date',
                                'payment_status']
    # features to be removed from both the train and test sets
    features_to_remove_both = ['state', 'violator_name', 'disposition',
                               'zip_code', 'country', 'city', 'inspector_name', 'violation_street_number', 'violation_street_name',
                               'violation_zip_code', 'violation_description', 'time_gap',
                               'mailing_address_str_number', 'mailing_address_str_name',
                               'non_us_str_code', 'clean_up_cost',
                               'ticket_issued_date', 'hearing_date', 'grafitti_status', 'violation_code']
    train_data.drop(features_to_remove_train, axis=1, inplace=True)
    train_data.drop(features_to_remove_both, axis=1, inplace=True)
    test_data.drop(features_to_remove_both, axis=1, inplace=True)
    features = train_data.columns.drop('compliance')
    print("INITIAL:", list(features))
    # keep only the features that are also present in the test set
    features_set = set(features)
    for feature in set(features):
        if feature not in test_data:
            features_set.remove(feature)
    features = list(features_set)
    print("FINAL:", features)
    X_train = train_data[features]
    print(X_train.isnull().sum())
    y_train = train_data['compliance']
    X_test = test_data[features]
    print(len(features))
    # scaling is optional here; tree-based models are insensitive to feature scale
    # scaler = MinMaxScaler()
    # X_train = scaler.fit_transform(X_train)
    # X_test = scaler.transform(X_test)
    clf = RandomForestClassifier(n_estimators=100, max_depth=8, n_jobs=-1, random_state=0)
    # hold out 20% of the training data to estimate generalisation via ROC AUC
    X_train, X_valid, y_train, y_valid = train_test_split(train_data[features], train_data['compliance'], test_size=0.2, random_state=0)
    clf.fit(X_train, y_train)
    print(roc_auc_score(y_valid, clf.predict_proba(X_valid)[:, 1]))
    # refit on the full training set before scoring the test set
    clf.fit(train_data[features], train_data['compliance'])
    y_pred = clf.predict_proba(test_data[features])[:, 1]
    test_data['compliance'] = y_pred
blight_model()
## roc_auc_score: 0.7946978282463928 (value recorded from the original run)
# AUC (area under the ROC curve) is a ranking metric for binary classifiers
# ROC - receiver operating characteristic
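# Toy illustration (added, not part of the original gist): ROC AUC depends only on how
# the scores rank the two classes, so rescaling the predicted probabilities leaves it
# unchanged. Uses the same roc_auc_score imported above.
toy_y_true = [0, 0, 1, 1]
toy_scores = [0.1, 0.4, 0.35, 0.8]
print(roc_auc_score(toy_y_true, toy_scores))                   # 0.75
print(roc_auc_score(toy_y_true, [s / 2 for s in toy_scores]))  # still 0.75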