Skip to content

Instantly share code, notes, and snippets.

@maheshakya
Last active November 26, 2015 10:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save maheshakya/9a847c90b8b94430c748 to your computer and use it in GitHub Desktop.
Save maheshakya/9a847c90b8b94430c748 to your computer and use it in GitHub Desktop.
;)
"""
Dependencies: Python 2.7 or higher, numpy, pandas, scikit-learn
"""
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#Helper functions
def diff(a, b):
    """Return the items of ``a`` that do not occur in ``b``.

    The order of ``a`` (including any duplicates) is preserved. ``b`` is
    turned into a set once so every membership test is a hash lookup, which
    means the items of ``b`` must be hashable.
    """
    excluded = set(b)
    return [item for item in a if item not in excluded]
# Load the CSV files.
# Only the projects, outcomes and sample-submission tables are read by this
# script; the remaining data sets are left commented out.
#donations = pd.read_csv('Data/donations.csv')
projects = pd.read_csv('Data/projects.csv')
outcomes = pd.read_csv('Data/outcomes.csv')
#resources = pd.read_csv('Data/resources.csv')
sample = pd.read_csv('Data/sampleSubmission.csv')
#essays = pd.read_csv('Data/essays.csv')
# Parenthesized form prints the same text on Python 2.7 and Python 3.
print('Read data files.')
# Sort the data according to the project ID.
# Later steps pair rows across these tables by position, so all of them are
# put into the same projectid order here.
# DataFrame.sort() no longer exists in pandas; sort_values() is the
# replacement and returns a new, sorted frame.
#essays = essays.sort_values('projectid')
projects = projects.sort_values('projectid')
sample = sample.sort_values('projectid')
outcomes = outcomes.sort_values('projectid')
#donations = donations.sort_values('projectid')
#resources = resources.sort_values('projectid')
# Split row positions into training and test sets by posting date.
# Rows posted before 2014-01-01 are training rows; rows posted on or after
# that date are test rows. The comparison is done against the date strings
# as they appear in the date_posted column.
dates = np.array(projects.date_posted)
train_idx, = np.where(dates < '2014-01-01')
test_idx, = np.where(dates >= '2014-01-01')
# Fill missing values.
# Forward fill ('pad') is a naive strategy: each gap is filled with the last
# valid value from an earlier row of the same column. We have better methods.
# ffill() is the supported spelling of the deprecated fillna(method='pad').
# NOTE(review): gaps that precede the first valid value in a column are left
# as NaN by a forward fill -- confirm that is acceptable for the encoders.
projects = projects.ffill()
# Target labels: the is_exciting column of the outcomes table as a NumPy array.
labels = np.array(outcomes['is_exciting'])
# Preprocessing
# The numeric and ID columns are listed so they can be excluded from the
# categorical set below; this script does not feed them to the model.
projects_numeric_columns = [
    'school_latitude', 'school_longitude',
    'fulfillment_labor_materials',
    'total_price_excluding_optional_support',
    'total_price_including_optional_support',
]
projects_id_columns = ['projectid', 'teacher_acctid', 'schoolid', 'school_ncesid']
# Every column that is not an ID, a numeric column or the posting date is
# treated as categorical. The original column order is preserved.
projects_categorial_columns = diff(
    list(projects.columns),
    projects_id_columns + projects_numeric_columns + ['date_posted'])
projects_categorial_values = np.array(projects[projects_categorial_columns])
# Integer-encode each column with its own LabelEncoder, then stack all encoded
# columns in a single call. Stacking once avoids recopying the growing matrix
# on every iteration and always yields a 2-D array, even for a single column.
projects_data = np.column_stack([
    LabelEncoder().fit_transform(column)
    for column in projects_categorial_values.T
]).astype(float)
# One-hot encode the integer category codes.
# The encoder is fitted on the full matrix (training and test rows together)
# and then applied to that same matrix.
enc = OneHotEncoder().fit(projects_data)
projects_data = enc.transform(projects_data)
# Fit a logistic regression on the training rows and score the test rows.
train, test = projects_data[train_idx], projects_data[test_idx]
# The target is boolean: True where the outcome label equals 't'.
# NOTE(review): assumes outcomes has exactly one row per training project, in
# the same projectid order as the training rows -- confirm against the data.
clf = LogisticRegression().fit(train, labels == 't')
# Keep the second probability column (index 1) -- presumably the True class.
preds = clf.predict_proba(test)[:, 1]
# Store the predicted probabilities in the sample-submission frame and write
# it to disk without the index column.
predictions_path = 'predictions.csv'
sample['is_exciting'] = preds
sample.to_csv(predictions_path, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment